author    pineappleEA <pineaea@gmail.com>  2024-02-09 20:23:03 +0100
committer pineappleEA <pineaea@gmail.com>  2024-02-09 20:23:03 +0100
commit    b48b6e3b79c09c46384627ce7ae47a81f77187b8 (patch)
tree      bdcb00d2a9fba0a7580fcfe5ebf282814eb48841
parent    5a87b5c4005aa344b56ef9cd7eeb2d11d8c03d93 (diff)
early-access version 4125 (tag: EA-4125)
-rwxr-xr-x  README.md  2
-rwxr-xr-x  externals/CMakeLists.txt  7
-rwxr-xr-x  externals/sse2neon/sse2neon.h  9282
-rwxr-xr-x  src/android/app/src/main/AndroidManifest.xml  6
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt  12
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt  17
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt  3
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt  18
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt  13
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt  23
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt  13
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt  23
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt  6
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt  55
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt  24
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt  14
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt  42
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt  7
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt  13
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt  13
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt  18
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt  15
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt  9
-rwxr-xr-x  src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt  25
-rwxr-xr-x  src/android/app/src/main/jni/CMakeLists.txt  6
-rwxr-xr-x  src/android/app/src/main/jni/android_settings.h  2
-rwxr-xr-x  src/android/app/src/main/jni/emu_window/emu_window.cpp  4
-rwxr-xr-x  src/android/app/src/main/jni/game_metadata.cpp  22
-rwxr-xr-x  src/android/app/src/main/jni/native.cpp  125
-rwxr-xr-x  src/android/app/src/main/jni/native.h  6
-rwxr-xr-x  src/android/app/src/main/jni/native_config.cpp  121
-rwxr-xr-x  src/android/app/src/main/jni/native_log.cpp  13
-rwxr-xr-x  src/android/app/src/main/res/layout/fragment_emulation.xml  15
-rwxr-xr-x  src/android/app/src/main/res/menu/menu_overlay_options.xml  5
-rwxr-xr-x  src/android/app/src/main/res/values-ar/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-ckb/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-cs/strings.xml  1
-rwxr-xr-x  src/android/app/src/main/res/values-de/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-es/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-fr/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-he/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-hu/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-it/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-ja/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-ko/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-nb/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-pl/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-pt-rBR/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-pt-rPT/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-ru/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-uk/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-vi/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-zh-rCN/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values-zh-rTW/strings.xml  3
-rwxr-xr-x  src/android/app/src/main/res/values/strings.xml  5
-rwxr-xr-x  src/common/CMakeLists.txt  8
-rwxr-xr-x  src/common/android/android_common.cpp  65
-rwxr-xr-x  src/common/android/android_common.h  26
-rwxr-xr-x  src/common/android/applets/software_keyboard.cpp  277
-rwxr-xr-x  src/common/android/applets/software_keyboard.h  78
-rwxr-xr-x  src/common/android/id_cache.cpp  428
-rwxr-xr-x  src/common/android/id_cache.h  88
-rwxr-xr-x  src/common/fs/fs_android.cpp  167
-rwxr-xr-x  src/common/fs/fs_android.h  58
-rwxr-xr-x  src/core/file_sys/content_archive.cpp  4
-rwxr-xr-x  src/core/file_sys/content_archive.h  1
-rwxr-xr-x  src/core/hle/service/am/library_applet_creator.cpp  8
-rwxr-xr-x  src/core/hle/service/am/process.cpp  23
-rwxr-xr-x  src/core/hle/service/am/process.h  2
-rwxr-xr-x  src/video_core/CMakeLists.txt  4
-rwxr-xr-x  src/video_core/host1x/vic.cpp  23
71 files changed, 10736 insertions(+), 566 deletions(-)
diff --git a/README.md b/README.md
index c164deec9..2950656e4 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============
 
-This is the source code for early-access 4124.
+This is the source code for early-access 4125.
 
 ## Legal Notice
 
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 836aed14f..9693edcb5 100755
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -314,3 +314,10 @@ endif()
 if (NOT TARGET SimpleIni::SimpleIni)
     add_subdirectory(simpleini)
 endif()
+
+# sse2neon
+if (ARCHITECTURE_arm64 AND NOT TARGET sse2neon)
+    add_library(sse2neon INTERFACE)
+    target_include_directories(sse2neon INTERFACE sse2neon)
+endif()
+
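For reference (not part of the commit): an INTERFACE library like this carries
only usage requirements, so a consumer picks up the sse2neon include path with
a plain target_link_libraries call. A minimal sketch, with "my_target" standing
in for whichever target in this tree includes the header:

    if (ARCHITECTURE_arm64)
        target_link_libraries(my_target PRIVATE sse2neon)
    endif()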
diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h
new file mode 100755
index 000000000..56254b5f9
--- /dev/null
+++ b/externals/sse2neon/sse2neon.h
@@ -0,0 +1,9282 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Copyright (c) 2015-2024 SSE2NEON Contributors.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions.
+//
+// Contributors to this work are:
+// John W. Ratcliff <jratcliffscarab@gmail.com>
+// Brandon Rowlett <browlett@nvidia.com>
+// Ken Fast <kfast@gdeb.com>
+// Eric van Beurden <evanbeurden@nvidia.com>
+// Alexander Potylitsin <apotylitsin@nvidia.com>
+// Hasindu Gamaarachchi <hasindu2008@gmail.com>
+// Jim Huang <jserv@ccns.ncku.edu.tw>
+// Mark Cheng <marktwtn@gmail.com>
+// Malcolm James MacLeod <malcolm@gulden.com>
+// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+// Sebastian Pop <spop@amazon.com>
+// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+// Danila Kutenin <danilak@google.com>
+// François Turban (JishinMaster) <francois.turban@gmail.com>
+// Pei-Hsuan Hung <afcidk@gmail.com>
+// Yang-Hao Yuan <yuanyanghau@gmail.com>
+// Syoyo Fujita <syoyo@lighttransport.com>
+// Brecht Van Lommel <brecht@blender.org>
+// Jonathan Hue <jhue@adobe.com>
+// Cuda Chen <clh960524@gmail.com>
+// Aymen Qader <aymen.qader@arm.com>
+// Anthony Roberts <anthony.roberts@linaro.org>
+
+/* Tunable configurations */
+
+/* Enable precise implementations of math operations.
+ * This slows down computation a bit, but gives results consistent with
+ * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
+ */
+/* _mm_min|max_ps|ss|pd|sd */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+/* _mm_dp_pd */
+#ifndef SSE2NEON_PRECISE_DP
+#define SSE2NEON_PRECISE_DP (0)
+#endif
+
+/* Enable inclusion of windows.h on MSVC platforms.
+ * This makes _mm_clflush functional on Windows, as there is no builtin.
+ */
+#ifndef SSE2NEON_INCLUDE_WINDOWS_H
+#define SSE2NEON_INCLUDE_WINDOWS_H (0)
+#endif
+
+/* compiler specific definitions */
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
+#elif defined(_MSC_VER)
+#if _MSVC_TRADITIONAL
+#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
+#endif
+#ifndef FORCE_INLINE
+#define FORCE_INLINE static inline
+#endif
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#define _sse2neon_likely(x) (x)
+#define _sse2neon_unlikely(x) (x)
+#else
+#pragma message("Macro name collisions may happen with unsupported compilers.")
+#endif
+
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
+#warning "GCC versions earlier than 10 are not supported."
+#endif
+
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2neon_const static const
+#else
+#define _sse2neon_const const
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(_WIN32)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
+ * from both MinGW-w64 and MSVC.
+ */
+#define SSE2NEON_ALLOC_DEFINED
+#endif
+
+/* If using MSVC */
+#ifdef _MSC_VER
+#include <intrin.h>
+#if SSE2NEON_INCLUDE_WINDOWS_H
+#include <processthreadsapi.h>
+#include <windows.h>
+#endif
+
+#if !defined(__cplusplus)
+#error SSE2NEON only supports C++ compilation with this compiler
+#endif
+
+#ifdef SSE2NEON_ALLOC_DEFINED
+#include <malloc.h>
+#endif
+
+#if (defined(_M_AMD64) || defined(__x86_64__)) || \
+    (defined(_M_ARM64) || defined(__arm64__))
+#define SSE2NEON_HAS_BITSCAN64
+#endif
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define _sse2neon_define0(type, s, body) \
+    __extension__({                      \
+        type _a = (s);                   \
+        body                             \
+    })
+#define _sse2neon_define1(type, s, body) \
+    __extension__({                      \
+        type _a = (s);                   \
+        body                             \
+    })
+#define _sse2neon_define2(type, a, b, body) \
+    __extension__({                         \
+        type _a = (a), _b = (b);            \
+        body                                \
+    })
+#define _sse2neon_return(ret) (ret)
+#else
+#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
+#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
+#define _sse2neon_define2(type, a, b, body) \
+    [](type _a, type _b) { body }((a), (b))
+#define _sse2neon_return(ret) return ret
+#endif
+
+#define _sse2neon_init(...) \
+    {                       \
+        __VA_ARGS__         \
+    }
+
+/* Compiler barrier */
+#if defined(_MSC_VER)
+#define SSE2NEON_BARRIER() _ReadWriteBarrier()
+#else
+#define SSE2NEON_BARRIER()                     \
+    do {                                       \
+        __asm__ __volatile__("" ::: "memory"); \
+        (void) 0;                              \
+    } while (0)
+#endif
+
+/* Memory barriers
+ * __atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#endif
+
+FORCE_INLINE void _sse2neon_smp_mb(void)
+{
+    SSE2NEON_BARRIER();
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+    !defined(__STDC_NO_ATOMICS__)
+    atomic_thread_fence(memory_order_seq_cst);
+#elif defined(__GNUC__) || defined(__clang__)
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#else /* MSVC */
+    __dmb(_ARM64_BARRIER_ISH);
+#endif
+}
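+// (Editor's note: this full sequentially consistent fence is what sse2neon
+// maps the x86 fence intrinsics, e.g. _mm_mfence, onto later in this file.)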
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#if !defined(__clang__) && !defined(_MSC_VER)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#elif __ARM_ARCH == 8
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error \
+    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
+#endif
+#if !defined(__clang__) && !defined(_MSC_VER)
+#pragma GCC push_options
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
+#if defined __has_include && __has_include(<arm_acle.h>)
+#include <arm_acle.h>
+#endif
+#endif
+
+/* Apple Silicon cache lines are twice the size commonly used by Intel, AMD,
+ * and other Arm microarchitectures.
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either AArch64 instructions or a libm fallback */
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write to or read these registers from user mode, we have to perform a
+ * syscall instead.
+ */
+#if (!defined(__aarch64__) && !defined(_M_ARM64))
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for the shuffle parameter of _mm_shuffle_ps().
+ * Argument fp3 is a digit [0123] that represents the float from argument "b"
+ * of _mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
+ * for fp2 in the result. fp1 is a digit [0123] that represents the float from
+ * argument "a" of _mm_shuffle_ps that will be placed in fp1 of the result.
+ * fp0 is the same for fp0 of the result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
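+// For example, _MM_SHUFFLE(3, 2, 1, 0) evaluates to 0xE4; used as
+// _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)), it selects
+// {a[0], a[1], b[2], b[3]}.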
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({                        \
+        type tmp = {__VA_ARGS__};          \
+        __builtin_shuffle(a, b, tmp);      \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+/* Flush zero mode macros. */
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+/* Denormals are zeros mode macros. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On the ARM 32-bit architecture, float64x2_t is not supported.
+// The data type __m128d has to be represented differently for the related
+// intrinsic conversions.
+#if defined(__aarch64__) || defined(_M_ARM64)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+// __int64 is defined in the Intrinsics Guide and maps to different data types
+// in different data models
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A union called 'SIMDVec' is defined in this header file; it can be used
+// by applications that attempt to access the contents of an __m128 struct
+// directly. Note that accessing the __m128 struct directly is considered
+// bad coding practice by Microsoft: @see:
+// https://learn.microsoft.com/en-us/cpp/cpp/m128
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly, so the developer can use SIMDVec as an alias for it. Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// The union is intended to allow direct access to an __m128 variable using the
+// names that the MSVC compiler provides. It should really only be used when
+// trying to access the members of the vector as integer values. GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally, direct accesses to SIMD vectors should not be used, since they can
+// cause a performance hit. If it really is needed, however, the original
+// __m128 variable can be aliased with a pointer to this union and used to
+// access individual components. The use of this union should be hidden behind
+// a macro that is used throughout the codebase to access the members instead
+// of always declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
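+// Illustrative use (editor's sketch, not part of the upstream API): given a
+// local __m128i named v, lane 2 can be read as an unsigned 32-bit value with
+//   uint32_t lane2 = vreinterpretq_nth_u32_m128i(v, 2);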
+
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128(void);
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Backwards compatibility for compilers lacking specific type support */
+
+// Older gcc does not define the vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) &&                        \
+    ((__GNUC__ <= 13 && defined(__arm__)) ||                           \
+     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
+     (__GNUC__ <= 9 && defined(__aarch64__)))
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+/* emulate vaddv u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
+    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
+}
+#else
+// Wraps vaddv_u8
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    return vaddv_u8(v8);
+}
+#endif
+
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+/* emulate vaddvq u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+    uint8_t res = 0;
+    for (int i = 0; i < 8; ++i)
+        res += tmp[i];
+    return res;
+}
+#else
+// Wraps vaddvq_u8
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    return vaddvq_u8(a);
+}
+#endif
+
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+/* emulate vaddvq u16 variant */
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    uint32x4_t m = vpaddlq_u16(a);
+    uint64x2_t n = vpaddlq_u32(m);
+    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
+
+    return vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 shorts, 16 bits each
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, 8 bits each
+ *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR (floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__) || defined(_M_ARM64)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places them in the low end of the result;
+// takes the lower 64 bits of b and places them in the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// Takes the lower two 32-bit values from a, swaps them, and places them in the
+// low end of the result; takes the higher two 32-bit values from b, swaps
+// them, and places them in the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low end and puts the high 64 bits of a in
+// the high end
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip? */
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// For MSVC, we check only if it is ARM64, as every single ARM64 processor
+// supported by WoA has crypto extensions. If this changes in the future,
+// this can be verified via the runtime-only method of:
+// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
+#if (defined(_M_ARM64) && !defined(__clang__)) || \
+    (defined(__ARM_FEATURE_CRYPTO) &&             \
+     (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+#if defined(_MSC_VER)
+    __n64 a1 = {a}, b1 = {b};
+    return vreinterpretq_u64_p128(vmull_p64(a1, b1));
+#else
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+#endif
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+#endif  // ARMv7 polyfill
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    vreinterpretq_m128i_s32(vsetq_lane_s32(                                 \
+        vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3),     \
+        vsetq_lane_s32(                                                     \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a),       \
+                                          ((imm) >> 2) & 0x3),              \
+                           vmovq_n_s32(vgetq_lane_s32(                      \
+                               vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
+                           1),                                              \
+            2),                                                             \
+        3))
+
+// Takes the upper 64 bits of a and places them in the low end of the result;
+// takes the lower 64 bits of a and places them in the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// Takes the lower two 32-bit values from a, swaps them, and places them in the
+// low end of the result; takes the higher two 32-bit values from a, swaps
+// them, and places them in the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// Rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down.
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// Rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up.
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// Gets the lower 64 bits of a and places them in the upper 64 bits;
+// gets the lower 64 bits of a and places them in the lower 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
+// the lower 64 bits; gets the lower 64 bits of a and places them in the upper
+// 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
+// the upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements,
+// and places them in the lower 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define _mm_shuffle_epi32_splat(a, imm) \
+    vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
+#else
+#define _mm_shuffle_epi32_splat(a, imm) \
+    vreinterpretq_m128i_s32(            \
+        vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
+#endif
+
+// NEON does not support a general purpose permute intrinsic.
+// Shuffle single-precision (32-bit) floating-point elements in a using the
+// control in imm8, and store the results in dst.
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
+#define _mm_shuffle_ps_default(a, b, imm)                                      \
+    vreinterpretq_m128_f32(vsetq_lane_f32(                                     \
+        vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3),         \
+        vsetq_lane_f32(                                                        \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3),     \
+            vsetq_lane_f32(                                                    \
+                vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+                vmovq_n_f32(                                                   \
+                    vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
+                1),                                                            \
+            2),                                                                \
+        3))
+
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
+// Store the results in the low 64 bits of dst, with the high 64 bits being
+// copied from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    _sse2neon_define1(                                                        \
+        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a);              \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        _sse2neon_return(vreinterpretq_m128i_s16(ret));)
+
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
+// Store the results in the high 64 bits of dst, with the low 64 bits being
+// copied from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    _sse2neon_define1(                                                         \
+        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a);               \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        _sse2neon_return(vreinterpretq_m128i_s16(ret));)
+
+/* MMX */
+
+// _mm_empty is a no-op on ARM
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add the lower single-precision (32-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+}
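+// (NEON's vbicq_s32(x, y) computes x & ~y, so the operands above are swapped
+// to obtain the SSE semantics dst = ~a & b.)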
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1326}
1327
1328// Compare the lower single-precision (32-bit) floating-point elements in a and
1329// b for not-greater-than-or-equal, store the result in the lower element of
1330// dst, and copy the upper 3 packed elements from a to the upper elements of
1331// dst.
1332// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
1333FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1334{
1335 return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1336}
1337
1338// Compare packed single-precision (32-bit) floating-point elements in a and b
1339// for not-greater-than, and store the results in dst.
1340// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
1341FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1342{
1343 return vreinterpretq_m128_u32(vmvnq_u32(
1344 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1345}
1346
1347// Compare the lower single-precision (32-bit) floating-point elements in a and
1348// b for not-greater-than, store the result in the lower element of dst, and
1349// copy the upper 3 packed elements from a to the upper elements of dst.
1350// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
1351FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1352{
1353 return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1354}
1355
1356// Compare packed single-precision (32-bit) floating-point elements in a and b
1357// for not-less-than-or-equal, and store the results in dst.
1358// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
1359FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1360{
1361 return vreinterpretq_m128_u32(vmvnq_u32(
1362 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1363}
1364
1365// Compare the lower single-precision (32-bit) floating-point elements in a and
1366// b for not-less-than-or-equal, store the result in the lower element of dst,
1367// and copy the upper 3 packed elements from a to the upper elements of dst.
1368// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
1369FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1370{
1371 return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1372}
1373
1374// Compare packed single-precision (32-bit) floating-point elements in a and b
1375// for not-less-than, and store the results in dst.
1376// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
1377FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1378{
1379 return vreinterpretq_m128_u32(vmvnq_u32(
1380 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1381}
1382
1383// Compare the lower single-precision (32-bit) floating-point elements in a and
1384// b for not-less-than, store the result in the lower element of dst, and copy
1385// the upper 3 packed elements from a to the upper elements of dst.
1386// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
1387FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1388{
1389 return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1390}
1391
1392// Compare packed single-precision (32-bit) floating-point elements in a and b
1393// to see if neither is NaN, and store the results in dst.
1394// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
1395//
1396// See also:
1397// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1398// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1399FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1400{
1401 // Note: NEON has no ordered-compare builtin.
1402 // Compare a == a and b == b: a NaN lane compares unequal to itself,
1403 // so ANDing the two results yields the ordered (neither-NaN) mask.
1404 uint32x4_t ceqaa =
1405 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1406 uint32x4_t ceqbb =
1407 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1408 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1409}
1410
1411// Compare the lower single-precision (32-bit) floating-point elements in a and
1412// b to see if neither is NaN, store the result in the lower element of dst, and
1413// copy the upper 3 packed elements from a to the upper elements of dst.
1414// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
1415FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
1416{
1417 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1418}
1419
1420// Compare packed single-precision (32-bit) floating-point elements in a and b
1421// to see if either is NaN, and store the results in dst.
1422// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
1423FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
1424{
1425 uint32x4_t f32a =
1426 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1427 uint32x4_t f32b =
1428 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1429 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1430}
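
// Editorial sketch (assumption): since NaN != NaN, comparing a vector with
// itself is the standard NaN test, which is exactly what the self-compares
// above exploit. A hypothetical per-lane isnan mask:
FORCE_INLINE __m128 sse2neon_example_isnan_mask(__m128 v)
{
    return _mm_cmpunord_ps(v, v); // all-ones lanes where v[i] is NaN
}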
1431
1432// Compare the lower single-precision (32-bit) floating-point elements in a and
1433// b to see if either is NaN, store the result in the lower element of dst, and
1434// copy the upper 3 packed elements from a to the upper elements of dst.
1435// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
1436FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1437{
1438 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1439}
1440
1441// Compare the lower single-precision (32-bit) floating-point element in a and b
1442// for equality, and return the boolean result (0 or 1).
1443// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
1444FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1445{
1446 uint32x4_t a_eq_b =
1447 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1448 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1449}
1450
1451// Compare the lower single-precision (32-bit) floating-point element in a and b
1452// for greater-than-or-equal, and return the boolean result (0 or 1).
1453// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
1454FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1455{
1456 uint32x4_t a_ge_b =
1457 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1458 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1459}
1460
1461// Compare the lower single-precision (32-bit) floating-point element in a and b
1462// for greater-than, and return the boolean result (0 or 1).
1463// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
1464FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1465{
1466 uint32x4_t a_gt_b =
1467 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1468 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1469}
1470
1471// Compare the lower single-precision (32-bit) floating-point element in a and b
1472// for less-than-or-equal, and return the boolean result (0 or 1).
1473// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
1474FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1475{
1476 uint32x4_t a_le_b =
1477 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1478 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1479}
1480
1481// Compare the lower single-precision (32-bit) floating-point element in a and b
1482// for less-than, and return the boolean result (0 or 1).
1483// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
1484FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1485{
1486 uint32x4_t a_lt_b =
1487 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1488 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1489}
1490
1491// Compare the lower single-precision (32-bit) floating-point element in a and b
1492// for not-equal, and return the boolean result (0 or 1).
1493// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
1494FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1495{
1496 return !_mm_comieq_ss(a, b);
1497}
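
// Editorial sketch (assumption): the _mm_comi*_ss family reduces a lane-0
// compare to a plain int, so scalar branches port directly. Note that the
// x86 COMISS instruction can raise floating-point exceptions on NaN input,
// which these NEON-based ports do not replicate.
FORCE_INLINE int sse2neon_example_scalar_less(float x, float y)
{
    __m128 a = vreinterpretq_m128_f32(vdupq_n_f32(x));
    __m128 b = vreinterpretq_m128_f32(vdupq_n_f32(y));
    return _mm_comilt_ss(a, b); // 1 if x < y, else 0
}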
1498
1499// Convert packed signed 32-bit integers in b to packed single-precision
1500// (32-bit) floating-point elements, store the results in the lower 2 elements
1501// of dst, and copy the upper 2 packed elements from a to the upper elements of
1502// dst.
1503// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
1504FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1505{
1506 return vreinterpretq_m128_f32(
1507 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1508 vget_high_f32(vreinterpretq_f32_m128(a))));
1509}
1510
1511// Convert packed single-precision (32-bit) floating-point elements in a to
1512// packed 32-bit integers, and store the results in dst.
1513// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
1514FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1515{
1516#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1517 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1518 return vreinterpret_m64_s32(
1519 vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1520#else
1521 return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1522 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1523#endif
1524}
1525
1526// Convert the signed 32-bit integer b to a single-precision (32-bit)
1527// floating-point element, store the result in the lower element of dst, and
1528// copy the upper 3 packed elements from a to the upper elements of dst.
1529// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
1530FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1531{
1532 return vreinterpretq_m128_f32(
1533 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1534}
1535
1536// Convert the lower single-precision (32-bit) floating-point element in a to a
1537// 32-bit integer, and store the result in dst.
1538// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
1539FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1540{
1541#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1542 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1543 return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1544 0);
1545#else
1546 float32_t data = vgetq_lane_f32(
1547 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1548 return (int32_t) data;
1549#endif
1550}
1551
1552// Convert packed 16-bit integers in a to packed single-precision (32-bit)
1553// floating-point elements, and store the results in dst.
1554// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
1555FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1556{
1557 return vreinterpretq_m128_f32(
1558 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1559}
1560
1561// Convert packed 32-bit integers in b to packed single-precision (32-bit)
1562// floating-point elements, store the results in the lower 2 elements of dst,
1563// and copy the upper 2 packed elements from a to the upper elements of dst.
1564// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
1565FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1566{
1567 return vreinterpretq_m128_f32(
1568 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1569 vget_high_f32(vreinterpretq_f32_m128(a))));
1570}
1571
1572// Convert packed signed 32-bit integers in a to packed single-precision
1573// (32-bit) floating-point elements, store the results in the lower 2 elements
1574// of dst, then convert the packed signed 32-bit integers in b to
1575// single-precision (32-bit) floating-point elements, and store the results in
1576// the upper 2 elements of dst.
1577// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
1578FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1579{
1580 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1581 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1582}
1583
1584// Convert the lower packed 8-bit integers in a to packed single-precision
1585// (32-bit) floating-point elements, and store the results in dst.
1586// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
1587FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1588{
1589 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1590 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1591}
1592
1593// Convert packed single-precision (32-bit) floating-point elements in a to
1594// packed 16-bit integers, and store the results in dst. Note: this intrinsic
1595// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1596// 0x7FFFFFFF.
1597// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
1598FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1599{
1600 return vreinterpret_m64_s16(
1601 vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
1602}
1603
1604// Convert packed single-precision (32-bit) floating-point elements in a to
1605// packed 32-bit integers, and store the results in dst.
1606// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
1607#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1608
1609// Convert packed single-precision (32-bit) floating-point elements in a to
1610// packed 8-bit integers, and store the results in lower 4 elements of dst.
1611// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1612// between 0x7F and 0x7FFFFFFF.
1613// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
1614FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1615{
1616 return vreinterpret_m64_s8(vqmovn_s16(
1617 vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
1618}
1619
1620// Convert packed unsigned 16-bit integers in a to packed single-precision
1621// (32-bit) floating-point elements, and store the results in dst.
1622// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
1623FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1624{
1625 return vreinterpretq_m128_f32(
1626 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1627}
1628
1629// Convert the lower packed unsigned 8-bit integers in a to packed
1630// single-precision (32-bit) floating-point elements, and store the results in
1631// dst.
1632// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
1633FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1634{
1635 return vreinterpretq_m128_f32(vcvtq_f32_u32(
1636 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1637}
1638
1639// Convert the signed 32-bit integer b to a single-precision (32-bit)
1640// floating-point element, store the result in the lower element of dst, and
1641// copy the upper 3 packed elements from a to the upper elements of dst.
1642// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
1643#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1644
1645// Convert the signed 64-bit integer b to a single-precision (32-bit)
1646// floating-point element, store the result in the lower element of dst, and
1647// copy the upper 3 packed elements from a to the upper elements of dst.
1648// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
1649FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1650{
1651 return vreinterpretq_m128_f32(
1652 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1653}
1654
1655// Copy the lower single-precision (32-bit) floating-point element of a to dst.
1656// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
1657FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1658{
1659 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1660}
1661
1662// Convert the lower single-precision (32-bit) floating-point element in a to a
1663// 32-bit integer, and store the result in dst.
1664// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
1665#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1666
1667// Convert the lower single-precision (32-bit) floating-point element in a to a
1668// 64-bit integer, and store the result in dst.
1669// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
1670FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1671{
1672#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1673 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1674 return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1675#else
1676 float32_t data = vgetq_lane_f32(
1677 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1678 return (int64_t) data;
1679#endif
1680}
1681
1682// Convert packed single-precision (32-bit) floating-point elements in a to
1683// packed 32-bit integers with truncation, and store the results in dst.
1684// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
1685FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1686{
1687 return vreinterpret_m64_s32(
1688 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1689}
1690
1691// Convert the lower single-precision (32-bit) floating-point element in a to a
1692// 32-bit integer with truncation, and store the result in dst.
1693// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
1694FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1695{
1696 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1697}
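
// Editorial sketch (assumption): _mm_cvt_ss2si rounds under the current
// rounding mode (round-to-nearest-even by default), whereas _mm_cvtt_ss2si
// always truncates toward zero, so the two can differ by one.
FORCE_INLINE void sse2neon_example_round_vs_truncate(void)
{
    __m128 v = vreinterpretq_m128_f32(vdupq_n_f32(2.7f));
    int rounded = _mm_cvt_ss2si(v);    // 3 under round-to-nearest
    int truncated = _mm_cvtt_ss2si(v); // 2, truncated toward zero
    (void) rounded;
    (void) truncated;
}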
1698
1699// Convert packed single-precision (32-bit) floating-point elements in a to
1700// packed 32-bit integers with truncation, and store the results in dst.
1701// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
1702#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1703
1704// Convert the lower single-precision (32-bit) floating-point element in a to a
1705// 32-bit integer with truncation, and store the result in dst.
1706// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
1707#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1708
1709// Convert the lower single-precision (32-bit) floating-point element in a to a
1710// 64-bit integer with truncation, and store the result in dst.
1711// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
1712FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1713{
1714 return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1715}
1716
1717// Divide packed single-precision (32-bit) floating-point elements in a by
1718// packed elements in b, and store the results in dst.
1719// ARMv7-A NEON lacks a precise division instruction, so we approximate
1720// b's reciprocal with vrecpeq_f32, refine it with Newton-Raphson
1721// iterations, and then multiply a by the refined reciprocal.
1722// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
1723FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1724{
1725#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_DIV
1726 return vreinterpretq_m128_f32(
1727 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1728#else
1729 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1730 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1731 // Additional Newton-Raphson iteration for accuracy
1732 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1733 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1734#endif
1735}
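
// Editorial note (sketch): vrecpsq_f32(x, b) returns (2 - x*b), so each
// refinement above is one Newton-Raphson step x' = x * (2 - x*b), which
// roughly doubles the number of correct mantissa bits per iteration.
FORCE_INLINE float32x4_t sse2neon_example_recip_step(float32x4_t x,
                                                     float32x4_t b)
{
    return vmulq_f32(x, vrecpsq_f32(x, b)); // x' = x * (2 - x*b)
}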
1736
1737// Divide the lower single-precision (32-bit) floating-point element in a by the
1738// lower single-precision (32-bit) floating-point element in b, store the result
1739// in the lower element of dst, and copy the upper 3 packed elements from a to
1740// the upper elements of dst.
1741// Warning: on ARMv7-A this does not produce the same result as Intel
1742// hardware and is not IEEE 754 compliant.
1743// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
1744FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1745{
1746 float32_t value =
1747 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1748 return vreinterpretq_m128_f32(
1749 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1750}
1751
1752// Extract a 16-bit integer from a, selected with imm8, and store the result in
1753// the lower element of dst.
1754// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
1755#define _mm_extract_pi16(a, imm) \
1756 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1757
1758// Free aligned memory that was allocated with _mm_malloc.
1759// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
1760#if !defined(SSE2NEON_ALLOC_DEFINED)
1761FORCE_INLINE void _mm_free(void *addr)
1762{
1763 free(addr);
1764}
1765#endif
1766
1767FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
1768{
1769 uint64_t value;
1770#if defined(_MSC_VER)
1771 value = _ReadStatusReg(ARM64_FPCR);
1772#else
1773 __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
1774#endif
1775 return value;
1776}
1777
1778FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
1779{
1780#if defined(_MSC_VER)
1781 _WriteStatusReg(ARM64_FPCR, value);
1782#else
1783 __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
1784#endif
1785}
1786
1787// Macro: Get the flush-to-zero bits from the MXCSR control and status
1788// register. The flush-to-zero mode may be either _MM_FLUSH_ZERO_ON or
1789// _MM_FLUSH_ZERO_OFF.
1790// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
1791FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
1792{
1793 union {
1794 fpcr_bitfield field;
1795#if defined(__aarch64__) || defined(_M_ARM64)
1796 uint64_t value;
1797#else
1798 uint32_t value;
1799#endif
1800 } r;
1801
1802#if defined(__aarch64__) || defined(_M_ARM64)
1803 r.value = _sse2neon_get_fpcr();
1804#else
1805 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1806#endif
1807
1808 return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1809}
1810
1811// Macro: Get the rounding mode bits from the MXCSR control and status
1812// register. The rounding mode may be any of _MM_ROUND_NEAREST,
1813// _MM_ROUND_DOWN, _MM_ROUND_UP, or _MM_ROUND_TOWARD_ZERO.
1814// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
1815FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
1816{
1817 union {
1818 fpcr_bitfield field;
1819#if defined(__aarch64__) || defined(_M_ARM64)
1820 uint64_t value;
1821#else
1822 uint32_t value;
1823#endif
1824 } r;
1825
1826#if defined(__aarch64__) || defined(_M_ARM64)
1827 r.value = _sse2neon_get_fpcr();
1828#else
1829 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1830#endif
1831
1832 if (r.field.bit22) {
1833 return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1834 } else {
1835 return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1836 }
1837}
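
// Editorial note (sketch): the two FPCR/FPSCR RMode bits read above map
// onto the SSE rounding flags as follows:
//   bit23 bit22   ARM rounding mode    SSE macro
//     0     0     to nearest (even)    _MM_ROUND_NEAREST
//     0     1     toward +infinity     _MM_ROUND_UP
//     1     0     toward -infinity     _MM_ROUND_DOWN
//     1     1     toward zero          _MM_ROUND_TOWARD_ZERO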
1838
1839// Copy a to dst, and insert the 16-bit integer i into dst at the location
1840// specified by imm8.
1841// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
1842#define _mm_insert_pi16(a, b, imm) \
1843 vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
1844
1845// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
1846// elements) from memory into dst. mem_addr must be aligned on a 16-byte
1847// boundary or a general-protection exception may be generated.
1848// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
1849FORCE_INLINE __m128 _mm_load_ps(const float *p)
1850{
1851 return vreinterpretq_m128_f32(vld1q_f32(p));
1852}
1853
1854// Load a single-precision (32-bit) floating-point element from memory into all
1855// elements of dst.
1856//
1857// dst[31:0] := MEM[mem_addr+31:mem_addr]
1858// dst[63:32] := MEM[mem_addr+31:mem_addr]
1859// dst[95:64] := MEM[mem_addr+31:mem_addr]
1860// dst[127:96] := MEM[mem_addr+31:mem_addr]
1861//
1862// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
1863#define _mm_load_ps1 _mm_load1_ps
1864
1865// Load a single-precision (32-bit) floating-point element from memory into the
1866// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
1867// aligned on any particular boundary.
1868// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
1869FORCE_INLINE __m128 _mm_load_ss(const float *p)
1870{
1871 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1872}
1873
1874// Load a single-precision (32-bit) floating-point element from memory into all
1875// elements of dst.
1876// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
1877FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1878{
1879 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1880}
1881
1882// Load 2 single-precision (32-bit) floating-point elements from memory into the
1883// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
1884// mem_addr does not need to be aligned on any particular boundary.
1885// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
1886FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1887{
1888 return vreinterpretq_m128_f32(
1889 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1890}
1891
1892// Load 2 single-precision (32-bit) floating-point elements from memory into the
1893// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
1894// mem_addr does not need to be aligned on any particular boundary.
1895// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
1896FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1897{
1898 return vreinterpretq_m128_f32(
1899 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1900}
1901
1902// Load 4 single-precision (32-bit) floating-point elements from memory into dst
1903// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1904// general-protection exception may be generated.
1905// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
1906FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1907{
1908 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1909 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1910}
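
// Editorial sketch of the shuffle above: vrev64q_f32 reverses lanes within
// each 64-bit half, [0 1 2 3] -> [1 0 3 2], and vextq_f32(v, v, 2) then
// rotates the vector by two lanes, yielding the fully reversed [3 2 1 0].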
1911
1912// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
1913// elements) from memory into dst. mem_addr does not need to be aligned on any
1914// particular boundary.
1915// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
1916FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1917{
1918 // NEON loads do not require alignment, so _mm_load_ps and
1919 // _mm_loadu_ps are equivalent here
1920 return vreinterpretq_m128_f32(vld1q_f32(p));
1921}
1922
1923// Load unaligned 16-bit integer from memory into the first element of dst.
1924// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
1925FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1926{
1927 return vreinterpretq_m128i_s16(
1928 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1929}
1930
1931// Load unaligned 64-bit integer from memory into the first element of dst.
1932// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
1933FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1934{
1935 return vreinterpretq_m128i_s64(
1936 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1937}
1938
1939// Allocate size bytes of memory, aligned to the alignment specified in align,
1940// and return a pointer to the allocated memory. _mm_free should be used to free
1941// memory that is allocated with _mm_malloc.
1942// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
1943#if !defined(SSE2NEON_ALLOC_DEFINED)
1944FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1945{
1946 void *ptr;
1947 if (align == 1)
1948 return malloc(size);
1949 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1950 align = sizeof(void *);
1951 if (!posix_memalign(&ptr, align, size))
1952 return ptr;
1953 return NULL;
1954}
1955#endif
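
// Editorial usage sketch (assumption): pair _mm_malloc with _mm_free, as
// with the x86 originals; the alignment must be a power of two.
FORCE_INLINE void sse2neon_example_aligned_alloc(void)
{
    float *buf = (float *) _mm_malloc(4 * sizeof(float), 16);
    if (buf) {
        buf[0] = 1.0f; // 16-byte aligned, safe for _mm_load_ps
        _mm_free(buf);
    }
}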
1956
1957// Conditionally store 8-bit integer elements from a into memory using mask
1958// (elements are not stored when the highest bit is not set in the corresponding
1959// element) and a non-temporal memory hint.
1960// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
1961FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1962{
1963 int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1964 __m128 b = _mm_load_ps((const float *) mem_addr);
1965 int8x8_t masked =
1966 vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1967 vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1968 vst1_s8((int8_t *) mem_addr, masked);
1969}
1970
1971// Conditionally store 8-bit integer elements from a into memory using mask
1972// (elements are not stored when the highest bit is not set in the corresponding
1973// element) and a non-temporal memory hint.
1974// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
1975#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1976
1977// Compare packed signed 16-bit integers in a and b, and store packed maximum
1978// values in dst.
1979// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
1980FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
1981{
1982 return vreinterpret_m64_s16(
1983 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1984}
1985
1986// Compare packed single-precision (32-bit) floating-point elements in a and b,
1987// and store packed maximum values in dst. dst does not follow the IEEE Standard
1988// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
1989// signed-zero values.
1990// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
1991FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1992{
1993#if SSE2NEON_PRECISE_MINMAX
1994 float32x4_t _a = vreinterpretq_f32_m128(a);
1995 float32x4_t _b = vreinterpretq_f32_m128(b);
1996 return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
1997#else
1998 return vreinterpretq_m128_f32(
1999 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2000#endif
2001}
2002
2003// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2004// values in dst.
2005// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
2006FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
2007{
2008 return vreinterpret_m64_u8(
2009 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2010}
2011
2012// Compare the lower single-precision (32-bit) floating-point elements in a and
2013// b, store the maximum value in the lower element of dst, and copy the upper 3
2014 // packed elements from a to the upper elements of dst. dst does not follow the
2015// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
2016// inputs are NaN or signed-zero values.
2017// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
2018FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2019{
2020 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2021 return vreinterpretq_m128_f32(
2022 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2023}
2024
2025// Compare packed signed 16-bit integers in a and b, and store packed minimum
2026// values in dst.
2027// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
2028FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2029{
2030 return vreinterpret_m64_s16(
2031 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2032}
2033
2034// Compare packed single-precision (32-bit) floating-point elements in a and b,
2035// and store packed minimum values in dst. dst does not follow the IEEE Standard
2036// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
2037// signed-zero values.
2038// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
2039FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2040{
2041#if SSE2NEON_PRECISE_MINMAX
2042 float32x4_t _a = vreinterpretq_f32_m128(a);
2043 float32x4_t _b = vreinterpretq_f32_m128(b);
2044 return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2045#else
2046 return vreinterpretq_m128_f32(
2047 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2048#endif
2049}
2050
2051// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2052// values in dst.
2053// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
2054FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2055{
2056 return vreinterpret_m64_u8(
2057 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2058}
2059
2060// Compare the lower single-precision (32-bit) floating-point elements in a and
2061// b, store the minimum value in the lower element of dst, and copy the upper 3
2062 // packed elements from a to the upper elements of dst. dst does not follow the
2063// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
2064// inputs are NaN or signed-zero values.
2065// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
2066FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2067{
2068 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2069 return vreinterpretq_m128_f32(
2070 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2071}
2072
2073// Move the lower single-precision (32-bit) floating-point element from b to the
2074// lower element of dst, and copy the upper 3 packed elements from a to the
2075// upper elements of dst.
2076// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
2077FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2078{
2079 return vreinterpretq_m128_f32(
2080 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2081 vreinterpretq_f32_m128(a), 0));
2082}
2083
2084// Move the upper 2 single-precision (32-bit) floating-point elements from b to
2085// the lower 2 elements of dst, and copy the upper 2 elements from a to the
2086// upper 2 elements of dst.
2087// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
2088FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
2089{
2090#if defined(__aarch64__) || defined(_M_ARM64)
2091 return vreinterpretq_m128_u64(
2092 vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
2093#else
2094 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
2095 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
2096 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2097#endif
2098}
2099
2100// Move the lower 2 single-precision (32-bit) floating-point elements from b to
2101// the upper 2 elements of dst, and copy the lower 2 elements from a to the
2102// lower 2 elements of dst.
2103// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
2104FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2105{
2106 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2107 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2108 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2109}
2110
2111// Create mask from the most significant bit of each 8-bit element in a, and
2112// store the result in dst.
2113// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
2114FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2115{
2116 uint8x8_t input = vreinterpret_u8_m64(a);
2117#if defined(__aarch64__) || defined(_M_ARM64)
2118 static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
2119 uint8x8_t tmp = vshr_n_u8(input, 7);
2120 return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
2121#else
2122 // Refer to the implementation of `_mm_movemask_epi8`
2123 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2124 uint32x2_t paired16 =
2125 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2126 uint8x8_t paired32 =
2127 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2128 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2129#endif
2130}
2131
2132// Set each bit of mask dst based on the most significant bit of the
2133// corresponding packed single-precision (32-bit) floating-point element in a.
2134// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
2135FORCE_INLINE int _mm_movemask_ps(__m128 a)
2136{
2137 uint32x4_t input = vreinterpretq_u32_m128(a);
2138#if defined(__aarch64__) || defined(_M_ARM64)
2139 static const int32_t shift[4] = {0, 1, 2, 3};
2140 uint32x4_t tmp = vshrq_n_u32(input, 31);
2141 return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
2142#else
2143 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2144 // Shift out everything but the sign bits with a 32-bit unsigned shift
2145 // right.
2146 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2147 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2148 uint8x16_t paired =
2149 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2150 // Extract the result.
2151 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2152#endif
2153}
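
// Editorial usage sketch (assumption): movemask packs the four sign bits
// into the low nibble of an int, so "did any lane match" tests on compare
// results reduce to a single integer comparison.
FORCE_INLINE int sse2neon_example_any_lane_set(__m128 cmp_result)
{
    return _mm_movemask_ps(cmp_result) != 0;
}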
2154
2155// Multiply packed single-precision (32-bit) floating-point elements in a and b,
2156// and store the results in dst.
2157// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
2158FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2159{
2160 return vreinterpretq_m128_f32(
2161 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2162}
2163
2164// Multiply the lower single-precision (32-bit) floating-point element in a and
2165// b, store the result in the lower element of dst, and copy the upper 3 packed
2166// elements from a to the upper elements of dst.
2167// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
2168FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2169{
2170 return _mm_move_ss(a, _mm_mul_ps(a, b));
2171}
2172
2173// Multiply the packed unsigned 16-bit integers in a and b, producing
2174// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2175// integers in dst.
2176// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
2177FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2178{
2179 return vreinterpret_m64_u16(vshrn_n_u32(
2180 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2181}
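
// Editorial note (sketch): vmull_u16 widens to the full 32-bit products and
// vshrn_n_u32(..., 16) narrows back while keeping bits [31:16] of each
// product, i.e. exactly the high halves that _mm_mulhi_pu16 must return.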
2182
2183// Compute the bitwise OR of packed single-precision (32-bit) floating-point
2184// elements in a and b, and store the results in dst.
2185// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
2186FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2187{
2188 return vreinterpretq_m128_s32(
2189 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2190}
2191
2192// Average packed unsigned 8-bit integers in a and b, and store the results in
2193// dst.
2194// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
2195#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2196
2197// Average packed unsigned 16-bit integers in a and b, and store the results in
2198// dst.
2199// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
2200#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2201
2202// Extract a 16-bit integer from a, selected with imm8, and store the result in
2203// the lower element of dst.
2204// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
2205#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2206
2207// Copy a to dst, and insert the 16-bit integer i into dst at the location
2208// specified by imm8.
2209// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
2210#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2211
2212// Compare packed signed 16-bit integers in a and b, and store packed maximum
2213// values in dst.
2214// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
2215#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2216
2217// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2218// values in dst.
2219// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
2220#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2221
2222// Compare packed signed 16-bit integers in a and b, and store packed minimum
2223// values in dst.
2224// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
2225#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2226
2227// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2228// values in dst.
2229// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
2230#define _m_pminub(a, b) _mm_min_pu8(a, b)
2231
2232// Create mask from the most significant bit of each 8-bit element in a, and
2233// store the result in dst.
2234// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
2235#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2236
2237// Multiply the packed unsigned 16-bit integers in a and b, producing
2238// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2239// integers in dst.
2240// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
2241#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2242
2243// Fetch the line of data from memory that contains address p to a location in
2244// the cache hierarchy specified by the locality hint i.
2245// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
2246FORCE_INLINE void _mm_prefetch(char const *p, int i)
2247{
2248 (void) i;
2249#if defined(_MSC_VER)
2250 switch (i) {
2251 case _MM_HINT_NTA:
2252 __prefetch2(p, 1);
2253 break;
2254 case _MM_HINT_T0:
2255 __prefetch2(p, 0);
2256 break;
2257 case _MM_HINT_T1:
2258 __prefetch2(p, 2);
2259 break;
2260 case _MM_HINT_T2:
2261 __prefetch2(p, 4);
2262 break;
2263 }
2264#else
2265 switch (i) {
2266 case _MM_HINT_NTA:
2267 __builtin_prefetch(p, 0, 0);
2268 break;
2269 case _MM_HINT_T0:
2270 __builtin_prefetch(p, 0, 3);
2271 break;
2272 case _MM_HINT_T1:
2273 __builtin_prefetch(p, 0, 2);
2274 break;
2275 case _MM_HINT_T2:
2276 __builtin_prefetch(p, 0, 1);
2277 break;
2278 }
2279#endif
2280}
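
// Editorial usage sketch (assumption): prefetch is only a hint and is safe
// to drop; a hypothetical read-ahead for a streaming loop:
FORCE_INLINE void sse2neon_example_prefetch(const float *p)
{
    _mm_prefetch((const char *) (p + 16), _MM_HINT_T0); // pull into L1
}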
2281
2282// Compute the absolute differences of packed unsigned 8-bit integers in a and
2283// b, then horizontally sum each consecutive 8 differences to produce four
2284// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2285// 16 bits of dst.
2286// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
2287#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2288
2289// Shuffle 16-bit integers in a using the control in imm8, and store the results
2290// in dst.
2291// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
2292#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2293
2294// Compute the approximate reciprocal of packed single-precision (32-bit)
2295// floating-point elements in a, and store the results in dst. The maximum
2296// relative error for this approximation is less than 1.5*2^-12.
2297// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
2298FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2299{
2300 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2301 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2302#if SSE2NEON_PRECISE_DIV
2303 // Additional Newton-Raphson iteration for accuracy
2304 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2305#endif
2306 return vreinterpretq_m128_f32(recip);
2307}
2308
2309// Compute the approximate reciprocal of the lower single-precision (32-bit)
2310// floating-point element in a, store the result in the lower element of dst,
2311// and copy the upper 3 packed elements from a to the upper elements of dst. The
2312// maximum relative error for this approximation is less than 1.5*2^-12.
2313// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
2314FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2315{
2316 return _mm_move_ss(a, _mm_rcp_ps(a));
2317}
2318
2319// Compute the approximate reciprocal square root of packed single-precision
2320// (32-bit) floating-point elements in a, and store the results in dst. The
2321// maximum relative error for this approximation is less than 1.5*2^-12.
2322// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
2323FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2324{
2325 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2326
2327 // Generate masks detecting whether the input has any 0.0f/-0.0f lanes
2328 // (whose reciprocal square root is +/-infinity under IEEE-754 rules).
2329 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2330 const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
2331 const uint32x4_t has_pos_zero =
2332 vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
2333 const uint32x4_t has_neg_zero =
2334 vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
2335
2336 out = vmulq_f32(
2337 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2338#if SSE2NEON_PRECISE_SQRT
2339 // Additional Newton-Raphson iteration for accuracy
2340 out = vmulq_f32(
2341 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2342#endif
2343
2344 // Set output vector element to infinity/negative-infinity if
2345 // the corresponding input vector element is 0.0f/-0.0f.
2346 out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
2347 out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
2348
2349 return vreinterpretq_m128_f32(out);
2350}
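
// Editorial usage sketch (assumption): a common use of the reciprocal
// square root estimate is fast scaling by 1/sqrt(len2), e.g. for vector
// normalization, accepting the ~1.5*2^-12 relative error noted above.
FORCE_INLINE __m128 sse2neon_example_scale_by_rsqrt(__m128 x, __m128 len2)
{
    return _mm_mul_ps(x, _mm_rsqrt_ps(len2));
}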
2351
2352// Compute the approximate reciprocal square root of the lower single-precision
2353// (32-bit) floating-point element in a, store the result in the lower element
2354// of dst, and copy the upper 3 packed elements from a to the upper elements of
2355// dst.
2356// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
2357FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2358{
2359 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2360}
2361
2362// Compute the absolute differences of packed unsigned 8-bit integers in a and
2363// b, then horizontally sum each consecutive 8 differences to produce four
2364// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2365// 16 bits of dst.
2366// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
2367FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2368{
2369 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2370 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2371 return vreinterpret_m64_u16(
2372 vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2373}
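
// Editorial note (sketch): vabd_u8 produces the eight per-byte absolute
// differences, and the vpaddl_u8/u16/u32 chain pairwise-widens and sums
// them into a single 64-bit total, which is then placed in lane 0.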
2374
2375// Macro: Set the flush-to-zero bits of the MXCSR control and status
2376// register to the value in unsigned 32-bit integer a. The flush-to-zero
2377// mode may be either _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF.
2378// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
2379FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
2380{
2381 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2382 // regardless of the value of the FZ bit.
2383 union {
2384 fpcr_bitfield field;
2385#if defined(__aarch64__) || defined(_M_ARM64)
2386 uint64_t value;
2387#else
2388 uint32_t value;
2389#endif
2390 } r;
2391
2392#if defined(__aarch64__) || defined(_M_ARM64)
2393 r.value = _sse2neon_get_fpcr();
2394#else
2395 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2396#endif
2397
2398 r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2399
2400#if defined(__aarch64__) || defined(_M_ARM64)
2401 _sse2neon_set_fpcr(r.value);
2402#else
2403 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2404#endif
2405}
2406
2407// Set packed single-precision (32-bit) floating-point elements in dst with the
2408// supplied values.
2409// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
2410FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2411{
2412 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2413 return vreinterpretq_m128_f32(vld1q_f32(data));
2414}
2415
2416// Broadcast single-precision (32-bit) floating-point value a to all elements of
2417// dst.
2418// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
2419FORCE_INLINE __m128 _mm_set_ps1(float _w)
2420{
2421 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2422}
2423
2424// Macro: Set the rounding mode bits of the MXCSR control and status
2425// register to the value in unsigned 32-bit integer a. The rounding mode
2426// may be any of _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, or
2427// _MM_ROUND_TOWARD_ZERO.
2428// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
2429FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2430{
2431 union {
2432 fpcr_bitfield field;
2433#if defined(__aarch64__) || defined(_M_ARM64)
2434 uint64_t value;
2435#else
2436 uint32_t value;
2437#endif
2438 } r;
2439
2440#if defined(__aarch64__) || defined(_M_ARM64)
2441 r.value = _sse2neon_get_fpcr();
2442#else
2443 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2444#endif
2445
2446 switch (rounding) {
2447 case _MM_ROUND_TOWARD_ZERO:
2448 r.field.bit22 = 1;
2449 r.field.bit23 = 1;
2450 break;
2451 case _MM_ROUND_DOWN:
2452 r.field.bit22 = 0;
2453 r.field.bit23 = 1;
2454 break;
2455 case _MM_ROUND_UP:
2456 r.field.bit22 = 1;
2457 r.field.bit23 = 0;
2458 break;
2459 default: //_MM_ROUND_NEAREST
2460 r.field.bit22 = 0;
2461 r.field.bit23 = 0;
2462 }
2463
2464#if defined(__aarch64__) || defined(_M_ARM64)
2465 _sse2neon_set_fpcr(r.value);
2466#else
2467 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2468#endif
2469}

// Copy single-precision (32-bit) floating-point element a to the lower element
// of dst, and zero the upper 3 elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
{
    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
}

// Broadcast single-precision (32-bit) floating-point value a to all elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
FORCE_INLINE __m128 _mm_set1_ps(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}

// Set the MXCSR control and status register with the value in unsigned 32-bit
// integer a.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
FORCE_INLINE void _mm_setcsr(unsigned int a)
{
    _MM_SET_ROUNDING_MODE(a);
}

// Get the unsigned 32-bit value of the MXCSR control and status register.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
FORCE_INLINE unsigned int _mm_getcsr(void)
{
    return _MM_GET_ROUNDING_MODE();
}

// Set packed single-precision (32-bit) floating-point elements in dst with the
// supplied values in reverse order.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
{
    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

// Return vector of type __m128 with all elements set to zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
FORCE_INLINE __m128 _mm_setzero_ps(void)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(0));
}
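
// Usage sketch (illustrative): _mm_set_ps() takes its arguments from the
// highest lane down to the lowest, while _mm_setr_ps() takes them in memory
// order, so the two calls below build the same vector {1,2,3,4} in lanes 0..3
// and their difference is zero in every lane. The helper name is hypothetical.
FORCE_INLINE __m128 _sse2neon_example_set_order(void)
{
    __m128 hi_first = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 lo_first = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    return vreinterpretq_m128_f32(
        vsubq_f32(vreinterpretq_f32_m128(hi_first),
                  vreinterpretq_f32_m128(lo_first))); /* all lanes 0.0f */
}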

// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
#ifdef _sse2neon_shuffle
#define _mm_shuffle_pi16(a, imm)                                       \
    vreinterpret_m64_s16(vshuffle_s16(                                 \
        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
        ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
#else
#define _mm_shuffle_pi16(a, imm)                                              \
    _sse2neon_define1(                                                        \
        __m64, a, int16x4_t ret;                                              \
        ret = vmov_n_s16(                                                     \
            vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3)));          \
        ret = vset_lane_s16(                                                  \
            vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
            1);                                                               \
        ret = vset_lane_s16(                                                  \
            vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
            2);                                                               \
        ret = vset_lane_s16(                                                  \
            vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
            3);                                                               \
        _sse2neon_return(vreinterpret_m64_s16(ret));)
#endif

// Perform a serializing operation on all store-to-memory instructions that were
// issued prior to this instruction. Guarantees that every store instruction
// that precedes, in program order, is globally visible before any store
// instruction which follows the fence in program order.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
FORCE_INLINE void _mm_sfence(void)
{
    _sse2neon_smp_mb();
}

// Perform a serializing operation on all load-from-memory and store-to-memory
// instructions that were issued prior to this instruction. Guarantees that
// every memory access that precedes, in program order, the memory fence
// instruction is globally visible before any memory instruction which follows
// the fence in program order.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
FORCE_INLINE void _mm_mfence(void)
{
    _sse2neon_smp_mb();
}

// Perform a serializing operation on all load-from-memory instructions that
// were issued prior to this instruction. Guarantees that every load instruction
// that precedes, in program order, is globally visible before any load
// instruction which follows the fence in program order.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
FORCE_INLINE void _mm_lfence(void)
{
    _sse2neon_smp_mb();
}

// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
#ifdef _sse2neon_shuffle
#define _mm_shuffle_ps(a, b, imm)                                              \
    __extension__({                                                            \
        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
        float32x4_t _shuf =                                                    \
            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128_f32(_shuf);                                         \
    })
#else  // generic
#define _mm_shuffle_ps(a, b, imm)                          \
    _sse2neon_define2(                                     \
        __m128, a, b, __m128 ret; switch (imm) {           \
            case _MM_SHUFFLE(1, 0, 3, 2):                  \
                ret = _mm_shuffle_ps_1032(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(2, 3, 0, 1):                  \
                ret = _mm_shuffle_ps_2301(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(0, 3, 2, 1):                  \
                ret = _mm_shuffle_ps_0321(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(2, 1, 0, 3):                  \
                ret = _mm_shuffle_ps_2103(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(1, 0, 1, 0):                  \
                ret = _mm_movelh_ps(_a, _b);               \
                break;                                     \
            case _MM_SHUFFLE(1, 0, 0, 1):                  \
                ret = _mm_shuffle_ps_1001(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(0, 1, 0, 1):                  \
                ret = _mm_shuffle_ps_0101(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(3, 2, 1, 0):                  \
                ret = _mm_shuffle_ps_3210(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(0, 0, 1, 1):                  \
                ret = _mm_shuffle_ps_0011(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(0, 0, 2, 2):                  \
                ret = _mm_shuffle_ps_0022(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(2, 2, 0, 0):                  \
                ret = _mm_shuffle_ps_2200(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(3, 2, 0, 2):                  \
                ret = _mm_shuffle_ps_3202(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(3, 2, 3, 2):                  \
                ret = _mm_movehl_ps(_b, _a);               \
                break;                                     \
            case _MM_SHUFFLE(1, 1, 3, 3):                  \
                ret = _mm_shuffle_ps_1133(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(2, 0, 1, 0):                  \
                ret = _mm_shuffle_ps_2010(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(2, 0, 0, 1):                  \
                ret = _mm_shuffle_ps_2001(_a, _b);         \
                break;                                     \
            case _MM_SHUFFLE(2, 0, 3, 2):                  \
                ret = _mm_shuffle_ps_2032(_a, _b);         \
                break;                                     \
            default:                                       \
                ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
                break;                                     \
        } _sse2neon_return(ret);)
#endif
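
// Usage sketch (illustrative): _MM_SHUFFLE(w, z, y, x) packs four 2-bit lane
// selectors into imm8. dst takes its two low lanes from a and its two high
// lanes from b, so _MM_SHUFFLE(3, 2, 1, 0) yields dst = { a0, a1, b2, b3 }.
// The helper name is hypothetical.
FORCE_INLINE __m128 _sse2neon_example_shuffle(__m128 a, __m128 b)
{
    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
}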

// Compute the square root of packed single-precision (32-bit) floating-point
// elements in a, and store the results in dst.
// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
// square root by multiplying input in with its reciprocal square root before
// using the Newton-Raphson method to approximate the results.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
#else
    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));

    // Test for vrsqrteq_f32(0) -> positive infinity case.
    // Change to zero, so that s * 1/sqrt(s) result is zero too.
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t div_by_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
    recip = vreinterpretq_f32_u32(
        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));

    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);

    // sqrt(s) = s * 1/sqrt(s)
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
#endif
}
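
// The ARMv7 path above is the classic reciprocal-square-root refinement:
// starting from the vrsqrteq_f32() estimate x0 ~= 1/sqrt(s), each
// vrsqrtsq_f32() step computes (3 - s * x^2) / 2, so multiplying by x applies
// the Newton-Raphson update x' = x * (3 - s * x^2) / 2, roughly doubling the
// number of correct bits; the final multiply uses sqrt(s) = s * 1/sqrt(s).
// A scalar sketch of one refinement step (illustrative only):
FORCE_INLINE float _sse2neon_example_rsqrt_step(float s, float x)
{
    return x * (3.0f - s * x * x) * 0.5f;
}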

// Compute the square root of the lower single-precision (32-bit) floating-point
// element in a, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
// or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    vst1q_f32(p, vdupq_n_f32(a0));
}

// Store the lower single-precision (32-bit) floating-point element from a into
// memory. mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
{
    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}

// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
#define _mm_store1_ps _mm_store_ps1

// Store the upper 2 single-precision (32-bit) floating-point elements from a
// into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_high_f32(a));
}

// Store the lower 2 single-precision (32-bit) floating-point elements from a
// into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_low_f32(a));
}

// Store 4 single-precision (32-bit) floating-point elements from a into memory
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
{
    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
    float32x4_t rev = vextq_f32(tmp, tmp, 2);
    vst1q_f32(p, rev);
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from a into memory. mem_addr does not need to be aligned on any
// particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Stores 16-bits of integer data a at the address p.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
{
    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
}

// Stores 64-bits of integer data a at the address p.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
{
    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
}

// Store 64-bits of integer data from a into memory using a non-temporal memory
// hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
{
    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
// point elements) from a into memory using a non-temporal memory hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (float32x4_t *) p);
#else
    vst1q_f32(p, vreinterpretq_f32_m128(a));
#endif
}
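
// Usage sketch (illustrative): on x86, non-temporal stores bypass the cache,
// so SSE code conventionally publishes a streamed buffer with _mm_sfence()
// before another observer reads it; the same pattern maps onto the barrier
// implementations above. The helper name is hypothetical.
FORCE_INLINE void _sse2neon_example_stream_fill(float *dst, int nvec, __m128 v)
{
    for (int i = 0; i < nvec; i++)
        _mm_stream_ps(dst + 4 * i, v);
    _mm_sfence(); /* make the streamed stores visible before signalling */
}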

// Subtract packed single-precision (32-bit) floating-point elements in b from
// packed single-precision (32-bit) floating-point elements in a, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Subtract the lower single-precision (32-bit) floating-point element in b from
// the lower single-precision (32-bit) floating-point element in a, store the
// result in the lower element of dst, and copy the upper 3 packed elements from
// a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_sub_ps(a, b));
}

// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
// transposed matrix in these vectors (row0 now contains column 0, etc.).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
    do {                                                  \
        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
                            vget_low_f32(ROW23.val[0]));  \
        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
                            vget_low_f32(ROW23.val[1]));  \
        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
                            vget_high_f32(ROW23.val[0])); \
        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
                            vget_high_f32(ROW23.val[1])); \
    } while (0)
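
// Usage sketch (illustrative): transposing a 4x4 matrix stored row-major in
// memory. After the macro, row0 holds the original column 0, and so on.
// _mm_loadu_ps() is defined earlier in this file; the helper name is
// hypothetical.
FORCE_INLINE void _sse2neon_example_transpose4x4(float m[16])
{
    __m128 row0 = _mm_loadu_ps(m + 0);
    __m128 row1 = _mm_loadu_ps(m + 4);
    __m128 row2 = _mm_loadu_ps(m + 8);
    __m128 row3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
    _mm_storeu_ps(m + 0, row0);
    _mm_storeu_ps(m + 4, row1);
    _mm_storeu_ps(m + 8, row2);
    _mm_storeu_ps(m + 12, row3);
}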

// According to the documentation, these intrinsics behave the same as the
// non-'u' versions. We'll just alias them here.
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomineq_ss _mm_comineq_ss

// Return vector of type __m128i with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128
FORCE_INLINE __m128i _mm_undefined_si128(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128i a;
#if defined(_MSC_VER)
    a = _mm_setzero_si128();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Return vector of type __m128 with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128 a;
#if defined(_MSC_VER)
    a = _mm_setzero_ps();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Unpack and interleave single-precision (32-bit) floating-point elements from
// the high half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave single-precision (32-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}
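
// Usage sketch (illustrative): XOR with a sign-bit mask is the classic
// branch-free negation for packed floats, since flipping bit 31 of an IEEE-754
// single negates it. -0.0f has only the sign bit set. The helper name is
// hypothetical.
FORCE_INLINE __m128 _sse2neon_example_negate(__m128 a)
{
    return _mm_xor_ps(a, _mm_set1_ps(-0.0f));
}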

/* SSE2 */

// Add packed 16-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Add packed 32-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Add packed 64-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Add packed 8-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element from
// a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_add_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add 64-bit integers a and b, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return vreinterpret_m64_s64(
        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Add packed signed 16-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Add packed signed 8-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed unsigned 16-bit integers in a and b using saturation, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Add packed unsigned 8-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
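
// Usage sketch (illustrative): unlike the wrapping _mm_add_epi8(), the
// saturating forms clamp at the type's limits. Adding 200 + 100 per unsigned
// byte saturates to 255 here, where the wrapping add would give 44. The helper
// name is hypothetical.
FORCE_INLINE __m128i _sse2neon_example_saturating_add(void)
{
    __m128i a = vreinterpretq_m128i_u8(vdupq_n_u8(200));
    __m128i b = vreinterpretq_m128i_u8(vdupq_n_u8(100));
    return _mm_adds_epu8(a, b); /* every byte is 255 */
}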

// Compute the bitwise AND of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_s64(
        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
// elements in a and then AND with b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    // *NOTE* argument swap
    return vreinterpretq_m128d_s64(
        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
}

// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
// AND with b, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vbicq_s32(vreinterpretq_s32_m128i(b),
                  vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
}
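
// Usage sketch (illustrative): _mm_andnot_si128(mask, v) computes (~mask) & v,
// which makes it the usual building block for a branch-free select,
// (mask & x) | (~mask & y). vorrq_s32() stands in for _mm_or_si128(), which is
// defined later in this file. The helper name is hypothetical.
FORCE_INLINE __m128i _sse2neon_example_select(__m128i mask, __m128i x, __m128i y)
{
    __m128i picked_x = _mm_and_si128(mask, x);
    __m128i picked_y = _mm_andnot_si128(mask, y);
    return vreinterpretq_m128i_s32(
        vorrq_s32(vreinterpretq_s32_m128i(picked_x),
                  vreinterpretq_s32_m128i(picked_y)));
}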

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
                                 vreinterpretq_u16_m128i(b));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
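
// NEON's rounding halving add matches the SSE definition exactly:
// avg(a, b) = (a + b + 1) >> 1, computed without intermediate overflow.
// A scalar sketch (illustrative only):
FORCE_INLINE uint8_t _sse2neon_example_avg_u8(uint8_t a, uint8_t b)
{
    return (uint8_t) (((uint16_t) a + (uint16_t) b + 1) >> 1);
}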

// Shift a left by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)

// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)

// Cast vector of type __m128d to type __m128. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
{
    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
}

// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
{
    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
}

// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
{
    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
}

// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
{
    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
}

// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
#else
    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
#endif
}

// Cast vector of type __m128i to type __m128. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
{
    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
}
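
// Usage sketch (illustrative): the casts reinterpret bits, unlike the cvt
// conversions which change representation. Casting 1.0f to an integer vector
// yields its IEEE-754 bit pattern 0x3F800000 in every lane, not the value 1.
// The helper name is hypothetical.
FORCE_INLINE __m128i _sse2neon_example_bitcast(void)
{
    return _mm_castps_si128(_mm_set1_ps(1.0f)); /* lanes hold 0x3F800000 */
}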

// Invalidate and flush the cache line that contains p from all levels of the
// cache hierarchy.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
#if defined(__APPLE__)
#include <libkern/OSCacheControl.h>
#endif
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;

    /* sys_icache_invalidate is supported since macOS 10.5.
     * However, it does not work on non-jailbroken iOS devices, although the
     * compilation is successful.
     */
#if defined(__APPLE__)
    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
#elif defined(__GNUC__) || defined(__clang__)
    uintptr_t ptr = (uintptr_t) p;
    __builtin___clear_cache((char *) ptr,
                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
#elif defined(_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
    FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
#endif
}

// Compare packed 16-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed 32-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed 8-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for equality, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
#endif
}
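
// The ARMv7 fallback above builds a 64-bit lane equality out of 32-bit
// compares: vceqq_u32() yields a per-word mask, vrev64q_u32() swaps each pair
// of words, and the AND leaves all-ones in a 64-bit lane only when both of its
// words matched. Note that this compares bit patterns, so unlike a true
// floating-point compare it treats -0.0 and +0.0 as unequal and two identical
// NaN encodings as equal. A scalar mirror of the trick (illustrative only):
FORCE_INLINE int _sse2neon_example_eq64_via_eq32(uint64_t a, uint64_t b)
{
    return ((uint32_t) a == (uint32_t) b) &&
           ((uint32_t) (a >> 32) == (uint32_t) (b >> 32));
}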

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for equality, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than-or-equal, store the result in the lower element of dst,
// and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
#else
    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed signed 16-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed signed 32-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
#else
    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than-or-equal, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmple_pd(a, b));
#else
    // expand "_mm_cmple_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed signed 16-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed signed 32-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-equal, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-greater-than-or-equal, store the result in the lower element of
// dst, and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-greater-than, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-less-than-or-equal, store the result in the lower element of dst,
// and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-less-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if neither is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Excluding NaNs, any two floating point numbers can be compared.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? ~UINT64_C(0)
               : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
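
// "Ordered" means neither operand is NaN. The self-comparison trick above
// works because NaN is the only value for which x == x is false. A scalar
// sketch (illustrative only):
FORCE_INLINE int _sse2neon_example_is_ordered(double x, double y)
{
    return (x == x) && (y == y); /* 0 iff either operand is NaN */
}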

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if neither is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if either is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // A NaN never compares equal to anything, not even to itself.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_s32(
        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? UINT64_C(0)
               : ~UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if either is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3723
3724// Compare the lower double-precision (64-bit) floating-point element in a and b
3725// for greater-than-or-equal, and return the boolean result (0 or 1).
3726// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
3727FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3728{
3729#if defined(__aarch64__) || defined(_M_ARM64)
3730 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3731#else
3732 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3733 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3734
3735 return (*(double *) &a0 >= *(double *) &b0);
3736#endif
3737}
3738
3739// Compare the lower double-precision (64-bit) floating-point element in a and b
3740// for greater-than, and return the boolean result (0 or 1).
3741// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
3742FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3743{
3744#if defined(__aarch64__) || defined(_M_ARM64)
3745 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3746#else
3747 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3748 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3749
3750 return (*(double *) &a0 > *(double *) &b0);
3751#endif
3752}
3753
3754// Compare the lower double-precision (64-bit) floating-point element in a and b
3755// for less-than-or-equal, and return the boolean result (0 or 1).
3756// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
3757FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3758{
3759#if defined(__aarch64__) || defined(_M_ARM64)
3760 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3761#else
3762 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3763 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3764
3765 return (*(double *) &a0 <= *(double *) &b0);
3766#endif
3767}
3768
3769// Compare the lower double-precision (64-bit) floating-point element in a and b
3770// for less-than, and return the boolean result (0 or 1).
3771// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
3772FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3773{
3774#if defined(__aarch64__) || defined(_M_ARM64)
3775 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3776#else
3777 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3778 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3779
3780 return (*(double *) &a0 < *(double *) &b0);
3781#endif
3782}
3783
3784// Compare the lower double-precision (64-bit) floating-point element in a and b
3785// for equality, and return the boolean result (0 or 1).
3786// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
3787FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3788{
3789#if defined(__aarch64__) || defined(_M_ARM64)
3790 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3791#else
3792 uint32x4_t a_not_nan =
3793 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3794 uint32x4_t b_not_nan =
3795 vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3796 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3797 uint32x4_t a_eq_b =
3798 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3799 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3800 vreinterpretq_u64_u32(a_eq_b));
3801 return vgetq_lane_u64(and_results, 0) & 0x1;
3802#endif
3803}
3804
3805// Compare the lower double-precision (64-bit) floating-point element in a and b
3806// for not-equal, and return the boolean result (0 or 1).
3807// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
3808FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3809{
3810 return !_mm_comieq_sd(a, b);
3811}
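
// Usage sketch (illustrative; not part of the original header): the
// _mm_comi*_sd family returns a plain int, so it can drive ordinary scalar
// branches directly:
//   if (_mm_comilt_sd(_mm_set_sd(1.0), _mm_set_sd(2.0))) {
//       /* taken: 1.0 < 2.0 */
//   }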
3812
3813// Convert packed signed 32-bit integers in a to packed double-precision
3814// (64-bit) floating-point elements, and store the results in dst.
3815// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
3816FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3817{
3818#if defined(__aarch64__) || defined(_M_ARM64)
3819 return vreinterpretq_m128d_f64(
3820 vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3821#else
3822 double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3823 double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3824 return _mm_set_pd(a1, a0);
3825#endif
3826}
3827
3828// Convert packed signed 32-bit integers in a to packed single-precision
3829// (32-bit) floating-point elements, and store the results in dst.
3830// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
3831FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3832{
3833 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3834}
3835
3836// Convert packed double-precision (64-bit) floating-point elements in a to
3837// packed 32-bit integers, and store the results in dst.
3838// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
3839FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3840{
3841 // vrnd32xq_f64 is not supported by clang
3842#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
3843 float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
3844 int64x2_t integers = vcvtq_s64_f64(rounded);
3845 return vreinterpretq_m128i_s32(
3846 vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
3847#else
3848 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3849 double d0 = ((double *) &rnd)[0];
3850 double d1 = ((double *) &rnd)[1];
3851 return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3852#endif
3853}
3854
3855// Convert packed double-precision (64-bit) floating-point elements in a to
3856// packed 32-bit integers, and store the results in dst.
3857// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
3858FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3859{
3860 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3861 double d0 = ((double *) &rnd)[0];
3862 double d1 = ((double *) &rnd)[1];
3863 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3864 return vreinterpret_m64_s32(vld1_s32(data));
3865}
3866
3867// Convert packed double-precision (64-bit) floating-point elements in a to
3868// packed single-precision (32-bit) floating-point elements, and store the
3869// results in dst.
3870// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
3871FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3872{
3873#if defined(__aarch64__) || defined(_M_ARM64)
3874 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3875 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3876#else
3877 float a0 = (float) ((double *) &a)[0];
3878 float a1 = (float) ((double *) &a)[1];
3879 return _mm_set_ps(0, 0, a1, a0);
3880#endif
3881}
3882
3883// Convert packed signed 32-bit integers in a to packed double-precision
3884// (64-bit) floating-point elements, and store the results in dst.
3885// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
3886FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3887{
3888#if defined(__aarch64__) || defined(_M_ARM64)
3889 return vreinterpretq_m128d_f64(
3890 vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3891#else
3892 double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3893 double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3894 return _mm_set_pd(a1, a0);
3895#endif
3896}
3897
3898// Convert packed single-precision (32-bit) floating-point elements in a to
3899// packed 32-bit integers, and store the results in dst.
3900// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
3901 // NOTE: the default rounding mode on SSE is 'round to even', which ARMv7-A
3902 // does not support. It is supported on ARMv8-A, however.
3903FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3904{
3905#if defined(__ARM_FEATURE_FRINT)
3906 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
3907#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
3908 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
3909 switch (_MM_GET_ROUNDING_MODE()) {
3910 case _MM_ROUND_NEAREST:
3911 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3912 case _MM_ROUND_DOWN:
3913 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3914 case _MM_ROUND_UP:
3915 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3916 default: // _MM_ROUND_TOWARD_ZERO
3917 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3918 }
3919#else
3920 float *f = (float *) &a;
3921 switch (_MM_GET_ROUNDING_MODE()) {
3922 case _MM_ROUND_NEAREST: {
3923 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3924 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3925 vdupq_n_f32(0.5f)); /* +/- 0.5 */
3926 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3927 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3928 int32x4_t r_trunc = vcvtq_s32_f32(
3929 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3930 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3931 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3932 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3933 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3934 float32x4_t delta = vsubq_f32(
3935 vreinterpretq_f32_m128(a),
3936 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3937 uint32x4_t is_delta_half =
3938 vceqq_f32(delta, half); /* delta == +/- 0.5 */
3939 return vreinterpretq_m128i_s32(
3940 vbslq_s32(is_delta_half, r_even, r_normal));
3941 }
3942 case _MM_ROUND_DOWN:
3943 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3944 floorf(f[0]));
3945 case _MM_ROUND_UP:
3946 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
3947 ceilf(f[0]));
3948 default: // _MM_ROUND_TOWARD_ZERO
3949 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3950 (int32_t) f[0]);
3951 }
3952#endif
3953}
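
// Usage sketch (illustrative; not part of the original header): under the
// default _MM_ROUND_NEAREST mode, halfway values round to the nearest even
// integer, which the ARMv7-A fallback above emulates by hand:
//   __m128 v = _mm_set_ps(3.5f, 2.5f, 1.5f, 0.5f);
//   __m128i r = _mm_cvtps_epi32(v); // lanes 0..3 = {0, 2, 2, 4}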
3954
3955// Convert packed single-precision (32-bit) floating-point elements in a to
3956// packed double-precision (64-bit) floating-point elements, and store the
3957// results in dst.
3958// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
3959FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
3960{
3961#if defined(__aarch64__) || defined(_M_ARM64)
3962 return vreinterpretq_m128d_f64(
3963 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
3964#else
3965 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
3966 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
3967 return _mm_set_pd(a1, a0);
3968#endif
3969}
3970
3971// Copy the lower double-precision (64-bit) floating-point element of a to dst.
3972// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
3973FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
3974{
3975#if defined(__aarch64__) || defined(_M_ARM64)
3976 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
3977#else
3978 return ((double *) &a)[0];
3979#endif
3980}
3981
3982// Convert the lower double-precision (64-bit) floating-point element in a to a
3983// 32-bit integer, and store the result in dst.
3984// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
3985FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
3986{
3987#if defined(__aarch64__) || defined(_M_ARM64)
3988 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3989#else
3990 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3991 double ret = ((double *) &rnd)[0];
3992 return (int32_t) ret;
3993#endif
3994}
3995
3996// Convert the lower double-precision (64-bit) floating-point element in a to a
3997// 64-bit integer, and store the result in dst.
3998// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
3999FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
4000{
4001#if defined(__aarch64__) || defined(_M_ARM64)
4002 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4003#else
4004 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4005 double ret = ((double *) &rnd)[0];
4006 return (int64_t) ret;
4007#endif
4008}
4009
4010// Convert the lower double-precision (64-bit) floating-point element in a to a
4011// 64-bit integer, and store the result in dst.
4012// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
4013#define _mm_cvtsd_si64x _mm_cvtsd_si64
4014
4015// Convert the lower double-precision (64-bit) floating-point element in b to a
4016// single-precision (32-bit) floating-point element, store the result in the
4017// lower element of dst, and copy the upper 3 packed elements from a to the
4018// upper elements of dst.
4019// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
4020FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4021{
4022#if defined(__aarch64__) || defined(_M_ARM64)
4023 return vreinterpretq_m128_f32(vsetq_lane_f32(
4024 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4025 vreinterpretq_f32_m128(a), 0));
4026#else
4027 return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4028 vreinterpretq_f32_m128(a), 0));
4029#endif
4030}
4031
4032// Copy the lower 32-bit integer in a to dst.
4033// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
4034FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4035{
4036 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4037}
4038
4039// Copy the lower 64-bit integer in a to dst.
4040// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
4041FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4042{
4043 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4044}
4045
4046// Copy the lower 64-bit integer in a to dst.
4047// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4048#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4049
4050// Convert the signed 32-bit integer b to a double-precision (64-bit)
4051// floating-point element, store the result in the lower element of dst, and
4052// copy the upper element from a to the upper element of dst.
4053// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
4054FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4055{
4056#if defined(__aarch64__) || defined(_M_ARM64)
4057 return vreinterpretq_m128d_f64(
4058 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4059#else
4060 double bf = (double) b;
4061 return vreinterpretq_m128d_s64(
4062 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4063#endif
4064}
4065
4070// Copy 32-bit integer a to the lower elements of dst, and zero the upper
4071// elements of dst.
4072// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
4073FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4074{
4075 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4076}
4077
4078// Convert the signed 64-bit integer b to a double-precision (64-bit)
4079// floating-point element, store the result in the lower element of dst, and
4080// copy the upper element from a to the upper element of dst.
4081// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
4082FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4083{
4084#if defined(__aarch64__) || defined(_M_ARM64)
4085 return vreinterpretq_m128d_f64(
4086 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4087#else
4088 double bf = (double) b;
4089 return vreinterpretq_m128d_s64(
4090 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4091#endif
4092}
4093
4094// Copy 64-bit integer a to the lower element of dst, and zero the upper
4095// element.
4096// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
4097FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4098{
4099 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4100}
4101
4102// Copy 64-bit integer a to the lower element of dst, and zero the upper
4103// element.
4104// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
4105#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4106
4107// Convert the signed 64-bit integer b to a double-precision (64-bit)
4108// floating-point element, store the result in the lower element of dst, and
4109// copy the upper element from a to the upper element of dst.
4110// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
4111#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4112
4113// Convert the lower single-precision (32-bit) floating-point element in b to a
4114// double-precision (64-bit) floating-point element, store the result in the
4115// lower element of dst, and copy the upper element from a to the upper element
4116// of dst.
4117// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
4118FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4119{
4120 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4121#if defined(__aarch64__) || defined(_M_ARM64)
4122 return vreinterpretq_m128d_f64(
4123 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4124#else
4125 return vreinterpretq_m128d_s64(
4126 vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4127#endif
4128}
4129
4130// Convert packed double-precision (64-bit) floating-point elements in a to
4131// packed 32-bit integers with truncation, and store the results in dst.
4132// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
4133FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4134{
4135 double a0 = ((double *) &a)[0];
4136 double a1 = ((double *) &a)[1];
4137 return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4138}
4139
4140// Convert packed double-precision (64-bit) floating-point elements in a to
4141// packed 32-bit integers with truncation, and store the results in dst.
4142// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
4143FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4144{
4145 double a0 = ((double *) &a)[0];
4146 double a1 = ((double *) &a)[1];
4147 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4148 return vreinterpret_m64_s32(vld1_s32(data));
4149}
4150
4151// Convert packed single-precision (32-bit) floating-point elements in a to
4152// packed 32-bit integers with truncation, and store the results in dst.
4153// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
4154FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4155{
4156 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4157}
4158
4159// Convert the lower double-precision (64-bit) floating-point element in a to a
4160// 32-bit integer with truncation, and store the result in dst.
4161// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
4162FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4163{
4164 double ret = *((double *) &a);
4165 return (int32_t) ret;
4166}
4167
4168// Convert the lower double-precision (64-bit) floating-point element in a to a
4169// 64-bit integer with truncation, and store the result in dst.
4170// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
4171FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4172{
4173#if defined(__aarch64__) || defined(_M_ARM64)
4174 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4175#else
4176 double ret = *((double *) &a);
4177 return (int64_t) ret;
4178#endif
4179}
4180
4181// Convert the lower double-precision (64-bit) floating-point element in a to a
4182// 64-bit integer with truncation, and store the result in dst.
4183// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
4184#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4185
4186// Divide packed double-precision (64-bit) floating-point elements in a by
4187// packed elements in b, and store the results in dst.
4188// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
4189FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4190{
4191#if defined(__aarch64__) || defined(_M_ARM64)
4192 return vreinterpretq_m128d_f64(
4193 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4194#else
4195 double *da = (double *) &a;
4196 double *db = (double *) &b;
4197 double c[2];
4198 c[0] = da[0] / db[0];
4199 c[1] = da[1] / db[1];
4200 return vld1q_f32((float32_t *) c);
4201#endif
4202}
4203
4204// Divide the lower double-precision (64-bit) floating-point element in a by the
4205// lower double-precision (64-bit) floating-point element in b, store the result
4206// in the lower element of dst, and copy the upper element from a to the upper
4207// element of dst.
4208// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
4209FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4210{
4211#if defined(__aarch64__) || defined(_M_ARM64)
4212 float64x2_t tmp =
4213 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4214 return vreinterpretq_m128d_f64(
4215 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4216#else
4217 return _mm_move_sd(a, _mm_div_pd(a, b));
4218#endif
4219}
4220
4221// Extract a 16-bit integer from a, selected with imm8, and store the result in
4222// the lower element of dst.
4223// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
4224// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4225#define _mm_extract_epi16(a, imm) \
4226 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4227
4228// Copy a to dst, and insert the 16-bit integer i into dst at the location
4229// specified by imm8.
4230// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
4231// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4232// __constrange(0,8) int imm)
4233#define _mm_insert_epi16(a, b, imm) \
4234 vreinterpretq_m128i_s16( \
4235 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4236
4237// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
4238// elements) from memory into dst. mem_addr must be aligned on a 16-byte
4239// boundary or a general-protection exception may be generated.
4240// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
4241FORCE_INLINE __m128d _mm_load_pd(const double *p)
4242{
4243#if defined(__aarch64__) || defined(_M_ARM64)
4244 return vreinterpretq_m128d_f64(vld1q_f64(p));
4245#else
4246 const float *fp = (const float *) p;
4247 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4248 return vreinterpretq_m128d_f32(vld1q_f32(data));
4249#endif
4250}
4251
4252// Load a double-precision (64-bit) floating-point element from memory into both
4253// elements of dst.
4254// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
4255#define _mm_load_pd1 _mm_load1_pd
4256
4257// Load a double-precision (64-bit) floating-point element from memory into the
4258// lower of dst, and zero the upper element. mem_addr does not need to be
4259// aligned on any particular boundary.
4260// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
4261FORCE_INLINE __m128d _mm_load_sd(const double *p)
4262{
4263#if defined(__aarch64__) || defined(_M_ARM64)
4264 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4265#else
4266 const float *fp = (const float *) p;
4267 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4268 return vreinterpretq_m128d_f32(vld1q_f32(data));
4269#endif
4270}
4271
4272// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
4273// on a 16-byte boundary or a general-protection exception may be generated.
4274// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
4275FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4276{
4277 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4278}
4279
4280// Load a double-precision (64-bit) floating-point element from memory into both
4281// elements of dst.
4282// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
4283FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4284{
4285#if defined(__aarch64__) || defined(_M_ARM64)
4286 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4287#else
4288 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4289#endif
4290}
4291
4292// Load a double-precision (64-bit) floating-point element from memory into the
4293// upper element of dst, and copy the lower element from a to dst. mem_addr does
4294// not need to be aligned on any particular boundary.
4295// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
4296FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4297{
4298#if defined(__aarch64__) || defined(_M_ARM64)
4299 return vreinterpretq_m128d_f64(
4300 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4301#else
4302 return vreinterpretq_m128d_f32(vcombine_f32(
4303 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4304#endif
4305}
4306
4307// Load 64-bit integer from memory into the first element of dst.
4308// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
4309FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4310{
4311 /* Load the lower 64 bits of the value pointed to by p into the
4312 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4313 */
4314 return vreinterpretq_m128i_s32(
4315 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4316}
4317
4318// Load a double-precision (64-bit) floating-point element from memory into the
4319// lower element of dst, and copy the upper element from a to dst. mem_addr does
4320// not need to be aligned on any particular boundary.
4321// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
4322FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4323{
4324#if defined(__aarch64__) || defined(_M_ARM64)
4325 return vreinterpretq_m128d_f64(
4326 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4327#else
4328 return vreinterpretq_m128d_f32(
4329 vcombine_f32(vld1_f32((const float *) p),
4330 vget_high_f32(vreinterpretq_f32_m128d(a))));
4331#endif
4332}
4333
4334// Load 2 double-precision (64-bit) floating-point elements from memory into dst
4335// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4336// general-protection exception may be generated.
4337// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
4338FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4339{
4340#if defined(__aarch64__) || defined(_M_ARM64)
4341 float64x2_t v = vld1q_f64(p);
4342 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4343#else
4344 int64x2_t v = vld1q_s64((const int64_t *) p);
4345 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4346#endif
4347}
4348
4349 // Load 2 double-precision (64-bit) floating-point elements from unaligned memory into dst.
4350// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
4351FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4352{
4353 return _mm_load_pd(p);
4354}
4355
4356// Load 128-bits of integer data from memory into dst. mem_addr does not need to
4357// be aligned on any particular boundary.
4358// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
4359FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4360{
4361 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4362}
4363
4364// Load unaligned 32-bit integer from memory into the first element of dst.
4365// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
4366FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4367{
4368 return vreinterpretq_m128i_s32(
4369 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4370}
4371
4372// Multiply packed signed 16-bit integers in a and b, producing intermediate
4373// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
4374// 32-bit integers, and pack the results in dst.
4375// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
4376FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4377{
4378 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4379 vget_low_s16(vreinterpretq_s16_m128i(b)));
4380#if defined(__aarch64__) || defined(_M_ARM64)
4381 int32x4_t high =
4382 vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
4383
4384 return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
4385#else
4386 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4387 vget_high_s16(vreinterpretq_s16_m128i(b)));
4388
4389 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4390 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4391
4392 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4393#endif
4394}
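
// Usage sketch (illustrative; not part of the original header): _mm_madd_epi16
// is the usual building block for 16-bit dot products; 32-bit lane i receives
// a[2i]*b[2i] + a[2i+1]*b[2i+1]:
//   __m128i r = _mm_madd_epi16(_mm_set1_epi16(2), _mm_set1_epi16(3));
//   // every 32-bit lane = 2*3 + 2*3 = 12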
4395
4396// Conditionally store 8-bit integer elements from a into memory using mask
4397// (elements are not stored when the highest bit is not set in the corresponding
4398// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4399// on any particular boundary.
4400// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
4401FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4402{
4403 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4404 __m128 b = _mm_load_ps((const float *) mem_addr);
4405 int8x16_t masked =
4406 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4407 vreinterpretq_s8_m128(b));
4408 vst1q_s8((int8_t *) mem_addr, masked);
4409}
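
// Note (an observation about the emulation above, not from the original
// header): unlike the x86 instruction, which writes only the selected bytes,
// this path performs an unconditional 16-byte load and a full 16-byte store.
// Usage sketch:
//   char buf[16] = {0};
//   __m128i mask = _mm_set1_epi8((char) 0x80); // MSB set: store every byte
//   _mm_maskmoveu_si128(_mm_set1_epi8(0x7F), mask, buf); // buf = 16 x 0x7F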
4410
4411// Compare packed signed 16-bit integers in a and b, and store packed maximum
4412// values in dst.
4413// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
4414FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4415{
4416 return vreinterpretq_m128i_s16(
4417 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4418}
4419
4420// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4421// values in dst.
4422// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
4423FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4424{
4425 return vreinterpretq_m128i_u8(
4426 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4427}
4428
4429// Compare packed double-precision (64-bit) floating-point elements in a and b,
4430// and store packed maximum values in dst.
4431// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
4432FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4433{
4434#if defined(__aarch64__) || defined(_M_ARM64)
4435#if SSE2NEON_PRECISE_MINMAX
4436 float64x2_t _a = vreinterpretq_f64_m128d(a);
4437 float64x2_t _b = vreinterpretq_f64_m128d(b);
4438 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4439#else
4440 return vreinterpretq_m128d_f64(
4441 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4442#endif
4443#else
4444 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4445 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4446 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4447 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4448 uint64_t d[2];
4449 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4450 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4451
4452 return vreinterpretq_m128d_u64(vld1q_u64(d));
4453#endif
4454}
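
// Note (an observation, not from the original header): x86 maxpd returns the
// second operand when either input is NaN, which the SSE2NEON_PRECISE_MINMAX
// path reproduces; plain vmaxq_f64 propagates the NaN instead:
//   _mm_max_pd(_mm_set1_pd(NAN), _mm_set1_pd(1.0));
//   // precise path: {1.0, 1.0}; vmaxq_f64 path: {NaN, NaN}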
4455
4456// Compare the lower double-precision (64-bit) floating-point elements in a and
4457// b, store the maximum value in the lower element of dst, and copy the upper
4458// element from a to the upper element of dst.
4459// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
4460FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4461{
4462#if defined(__aarch64__) || defined(_M_ARM64)
4463 return _mm_move_sd(a, _mm_max_pd(a, b));
4464#else
4465 double *da = (double *) &a;
4466 double *db = (double *) &b;
4467 double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4468 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4469#endif
4470}
4471
4472// Compare packed signed 16-bit integers in a and b, and store packed minimum
4473// values in dst.
4474// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
4475FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4476{
4477 return vreinterpretq_m128i_s16(
4478 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4479}
4480
4481// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4482// values in dst.
4483// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
4484FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4485{
4486 return vreinterpretq_m128i_u8(
4487 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4488}
4489
4490// Compare packed double-precision (64-bit) floating-point elements in a and b,
4491// and store packed minimum values in dst.
4492// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
4493FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4494{
4495#if defined(__aarch64__) || defined(_M_ARM64)
4496#if SSE2NEON_PRECISE_MINMAX
4497 float64x2_t _a = vreinterpretq_f64_m128d(a);
4498 float64x2_t _b = vreinterpretq_f64_m128d(b);
4499 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4500#else
4501 return vreinterpretq_m128d_f64(
4502 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4503#endif
4504#else
4505 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4506 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4507 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4508 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4509 uint64_t d[2];
4510 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4511 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4512 return vreinterpretq_m128d_u64(vld1q_u64(d));
4513#endif
4514}
4515
4516// Compare the lower double-precision (64-bit) floating-point elements in a and
4517// b, store the minimum value in the lower element of dst, and copy the upper
4518// element from a to the upper element of dst.
4519// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
4520FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4521{
4522#if defined(__aarch64__) || defined(_M_ARM64)
4523 return _mm_move_sd(a, _mm_min_pd(a, b));
4524#else
4525 double *da = (double *) &a;
4526 double *db = (double *) &b;
4527 double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4528 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4529#endif
4530}
4531
4532// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4533// upper element.
4534// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
4535FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4536{
4537 return vreinterpretq_m128i_s64(
4538 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4539}
4540
4541// Move the lower double-precision (64-bit) floating-point element from b to the
4542// lower element of dst, and copy the upper element from a to the upper element
4543// of dst.
4544// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
4545FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4546{
4547 return vreinterpretq_m128d_f32(
4548 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4549 vget_high_f32(vreinterpretq_f32_m128d(a))));
4550}
4551
4552// Create mask from the most significant bit of each 8-bit element in a, and
4553// store the result in dst.
4554// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
4555FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4556{
4557 // Use increasingly wide shifts+adds to collect the sign bits
4558 // together.
4559 // Since the widening shifts would be rather confusing to follow in little
4560 // endian, everything will be illustrated in big endian order instead. This
4561 // has a different result - the bits would actually be reversed on a big
4562 // endian machine.
4563
4564 // Starting input (only half the elements are shown):
4565 // 89 ff 1d c0 00 10 99 33
4566 uint8x16_t input = vreinterpretq_u8_m128i(a);
4567
4568 // Shift out everything but the sign bits with an unsigned shift right.
4569 //
4570 // Bytes of the vector:
4571 // 89 ff 1d c0 00 10 99 33
4572 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4573 // | | | | | | | |
4574 // 01 01 00 01 00 00 01 00
4575 //
4576 // Bits of first important lane(s):
4577 // 10001001 (89)
4578 // \______
4579 // |
4580 // 00000001 (01)
4581 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4582
4583 // Merge the even lanes together with a 16-bit unsigned shift right + add.
4584 // 'xx' represents garbage data which will be ignored in the final result.
4585 // In the important bytes, the add functions like a binary OR.
4586 //
4587 // 01 01 00 01 00 00 01 00
4588 //  \_ |  \_ |  \_ |  \_ |  paired16 = (uint32x4_t)(high_bits + (high_bits >> 7))
4589 // \| \| \| \|
4590 // xx 03 xx 01 xx 00 xx 02
4591 //
4592 // 00000001 00000001 (01 01)
4593 // \_______ |
4594 // \|
4595 // xxxxxxxx xxxxxx11 (xx 03)
4596 uint32x4_t paired16 =
4597 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4598
4599 // Repeat with a wider 32-bit shift + add.
4600 // xx 03 xx 01 xx 00 xx 02
4601 //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
4603 // \| \|
4604 // xx xx xx 0d xx xx xx 02
4605 //
4606 // 00000011 00000001 (03 01)
4607 // \\_____ ||
4608 // '----.\||
4609 // xxxxxxxx xxxx1101 (xx 0d)
4610 uint64x2_t paired32 =
4611 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4612
4613 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
4614 // xx xx xx 0d xx xx xx 02
4615 //     \_________ |  paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
4617 // \|
4618 // xx xx xx xx xx xx xx d2
4619 //
4620 // 00001101 00000010 (0d 02)
4621 // \ \___ | |
4622 // '---. \| |
4623 // xxxxxxxx 11010010 (xx d2)
4624 uint8x16_t paired64 =
4625 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4626
4627 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4628 // xx xx xx xx xx xx xx d2
4629 // || return paired64[0]
4630 // d2
4631 // Note: Little endian would return the correct value 4b (01001011) instead.
4632 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4633}
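
// Usage sketch (illustrative; not part of the original header): bit i of the
// result is the sign bit of byte i, so a vector whose bytes all have the top
// bit set yields 0xFFFF:
//   int m = _mm_movemask_epi8(_mm_set1_epi8((char) 0x80)); // m == 0xFFFF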
4634
4635// Set each bit of mask dst based on the most significant bit of the
4636// corresponding packed double-precision (64-bit) floating-point element in a.
4637// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
4638FORCE_INLINE int _mm_movemask_pd(__m128d a)
4639{
4640 uint64x2_t input = vreinterpretq_u64_m128d(a);
4641 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4642 return (int) (vgetq_lane_u64(high_bits, 0) |
4643 (vgetq_lane_u64(high_bits, 1) << 1));
4644}
4645
4646// Copy the lower 64-bit integer in a to dst.
4647// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
4648FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4649{
4650 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4651}
4652
4653// Copy the 64-bit integer a to the lower element of dst, and zero the upper
4654// element.
4655// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
4656FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4657{
4658 return vreinterpretq_m128i_s64(
4659 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4660}
4661
4662// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4663// a and b, and store the unsigned 64-bit results in dst.
4664// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
4665FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4666{
4667 // vmull_u32 upcasts instead of masking, so we downcast.
4668 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4669 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4670 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4671}
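
// Usage sketch (illustrative; not part of the original header): only the even
// 32-bit lanes (0 and 2) take part, and each product is kept at full 64-bit
// width:
//   __m128i r = _mm_mul_epu32(_mm_set1_epi32((int) 0x80000000),
//                             _mm_set1_epi32(2));
//   // both 64-bit lanes = 0x100000000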
4672
4673// Multiply packed double-precision (64-bit) floating-point elements in a and b,
4674// and store the results in dst.
4675// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
4676FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4677{
4678#if defined(__aarch64__) || defined(_M_ARM64)
4679 return vreinterpretq_m128d_f64(
4680 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4681#else
4682 double *da = (double *) &a;
4683 double *db = (double *) &b;
4684 double c[2];
4685 c[0] = da[0] * db[0];
4686 c[1] = da[1] * db[1];
4687 return vld1q_f32((float32_t *) c);
4688#endif
4689}
4690
4691// Multiply the lower double-precision (64-bit) floating-point element in a and
4692// b, store the result in the lower element of dst, and copy the upper element
4693// from a to the upper element of dst.
4694 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd
4695FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4696{
4697 return _mm_move_sd(a, _mm_mul_pd(a, b));
4698}
4699
4700// Multiply the low unsigned 32-bit integers from a and b, and store the
4701// unsigned 64-bit result in dst.
4702// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
4703FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4704{
4705 return vreinterpret_m64_u64(vget_low_u64(
4706 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4707}
4708
4709// Multiply the packed signed 16-bit integers in a and b, producing intermediate
4710// 32-bit integers, and store the high 16 bits of the intermediate integers in
4711// dst.
4712// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
4713FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4714{
4715     /* FIXME: issue with large values because of result saturation */
4716     // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4717     //                              vreinterpretq_s16_m128i(b)); /* = 2*a*b */
4718     // return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4719 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4720 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4721 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4722 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4723 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4724 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4725 uint16x8x2_t r =
4726 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4727 return vreinterpretq_m128i_u16(r.val[1]);
4728}
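
// Usage sketch (illustrative; not part of the original header): each lane
// keeps the high 16 bits of the full 32-bit product:
//   __m128i a = _mm_set1_epi16(0x4000);
//   __m128i r = _mm_mulhi_epi16(a, a); // 0x4000*0x4000 = 0x10000000 -> 0x1000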
4729
4730// Multiply the packed unsigned 16-bit integers in a and b, producing
4731// intermediate 32-bit integers, and store the high 16 bits of the intermediate
4732// integers in dst.
4733// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
4734FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4735{
4736 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4737 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4738 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4739#if defined(__aarch64__) || defined(_M_ARM64)
4740 uint32x4_t ab7654 =
4741 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4742 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4743 vreinterpretq_u16_u32(ab7654));
4744 return vreinterpretq_m128i_u16(r);
4745#else
4746 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4747 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4748 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4749 uint16x8x2_t r =
4750 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4751 return vreinterpretq_m128i_u16(r.val[1]);
4752#endif
4753}
4754
4755// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
4756// integers, and store the low 16 bits of the intermediate integers in dst.
4757// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
4758FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4759{
4760 return vreinterpretq_m128i_s16(
4761 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4762}
4763
4764// Compute the bitwise OR of packed double-precision (64-bit) floating-point
4765// elements in a and b, and store the results in dst.
4766 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd
4767FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4768{
4769 return vreinterpretq_m128d_s64(
4770 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4771}
4772
4773// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
4774// and store the result in dst.
4775// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
4776FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4777{
4778 return vreinterpretq_m128i_s32(
4779 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4780}
4781
4782// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4783// using signed saturation, and store the results in dst.
4784// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
4785FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4786{
4787 return vreinterpretq_m128i_s8(
4788 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4789 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4790}
4791
4792// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
4793// using signed saturation, and store the results in dst.
4794// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
4795FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4796{
4797 return vreinterpretq_m128i_s16(
4798 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4799 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4800}
4801
4802// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4803// using unsigned saturation, and store the results in dst.
4804// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
4805FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4806{
4807 return vreinterpretq_m128i_u8(
4808 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4809 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4810}
4811
4812 // Pause the processor. This is typically used in spin-wait loops and,
4813 // depending on the x86 processor, typical values are in the 40-100 cycle
4814 // range. The 'yield' instruction isn't a good fit because it's effectively a
4815 // nop on most Arm cores. Experience with several databases has shown that an
4816 // 'isb' is a reasonable approximation.
4817// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
4818FORCE_INLINE void _mm_pause(void)
4819{
4820#if defined(_MSC_VER)
4821 __isb(_ARM64_BARRIER_SY);
4822#else
4823 __asm__ __volatile__("isb\n");
4824#endif
4825}
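
// Usage sketch (illustrative; not part of the original header): a typical
// spin-wait loop polls a flag and backs off with _mm_pause on every miss.
// 'ready' is a hypothetical atomic_bool set by another thread:
//   while (!atomic_load_explicit(&ready, memory_order_acquire))
//       _mm_pause();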
4826
4827// Compute the absolute differences of packed unsigned 8-bit integers in a and
4828// b, then horizontally sum each consecutive 8 differences to produce two
4829// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
4830// 16 bits of 64-bit elements in dst.
4831// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
4832FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4833{
4834 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4835 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4836}
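
// Usage sketch (illustrative; not part of the original header): with a uniform
// byte difference of 7, each group of 8 bytes sums to 56:
//   __m128i r = _mm_sad_epu8(_mm_set1_epi8(10), _mm_set1_epi8(3));
//   // both 64-bit lanes = 56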
4837
4838// Set packed 16-bit integers in dst with the supplied values.
4839// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
4840FORCE_INLINE __m128i _mm_set_epi16(short i7,
4841 short i6,
4842 short i5,
4843 short i4,
4844 short i3,
4845 short i2,
4846 short i1,
4847 short i0)
4848{
4849 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4850 return vreinterpretq_m128i_s16(vld1q_s16(data));
4851}
4852
4853// Set packed 32-bit integers in dst with the supplied values.
4854// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
4855FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4856{
4857 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4858 return vreinterpretq_m128i_s32(vld1q_s32(data));
4859}
4860
4861// Set packed 64-bit integers in dst with the supplied values.
4862// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
4863FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4864{
4865 return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
4866}
4867
4868// Set packed 64-bit integers in dst with the supplied values.
4869// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
4870FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4871{
4872 return vreinterpretq_m128i_s64(
4873 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4874}
4875
4876// Set packed 8-bit integers in dst with the supplied values.
4877// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
4878FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4879 signed char b14,
4880 signed char b13,
4881 signed char b12,
4882 signed char b11,
4883 signed char b10,
4884 signed char b9,
4885 signed char b8,
4886 signed char b7,
4887 signed char b6,
4888 signed char b5,
4889 signed char b4,
4890 signed char b3,
4891 signed char b2,
4892 signed char b1,
4893 signed char b0)
4894{
4895 int8_t ALIGN_STRUCT(16)
4896 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4897 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4898 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4899 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4900 return (__m128i) vld1q_s8(data);
4901}
4902
4903// Set packed double-precision (64-bit) floating-point elements in dst with the
4904// supplied values.
4905// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
4906FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
4907{
4908 double ALIGN_STRUCT(16) data[2] = {e0, e1};
4909#if defined(__aarch64__) || defined(_M_ARM64)
4910 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4911#else
4912 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
4913#endif
4914}
4915
4916// Broadcast double-precision (64-bit) floating-point value a to all elements of
4917// dst.
4918// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
4919#define _mm_set_pd1 _mm_set1_pd
4920
4921// Copy double-precision (64-bit) floating-point element a to the lower element
4922// of dst, and zero the upper element.
4923// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
4924FORCE_INLINE __m128d _mm_set_sd(double a)
4925{
4926#if defined(__aarch64__) || defined(_M_ARM64)
4927 return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
4928#else
4929 return _mm_set_pd(0, a);
4930#endif
4931}
4932
4933// Broadcast 16-bit integer a to all elements of dst.
4934// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
4935FORCE_INLINE __m128i _mm_set1_epi16(short w)
4936{
4937 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
4938}
4939
4940// Broadcast 32-bit integer a to all elements of dst.
4941// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
4942FORCE_INLINE __m128i _mm_set1_epi32(int _i)
4943{
4944 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
4945}
4946
4947// Broadcast 64-bit integer a to all elements of dst.
4948// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
4949FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
4950{
4951 return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0));
4952}
4953
4954// Broadcast 64-bit integer a to all elements of dst.
4955// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
4956FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
4957{
4958 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
4959}
4960
4961// Broadcast 8-bit integer a to all elements of dst.
4962// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
4963FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
4964{
4965 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
4966}
4967
4968// Broadcast double-precision (64-bit) floating-point value a to all elements of
4969// dst.
4970// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
4971FORCE_INLINE __m128d _mm_set1_pd(double d)
4972{
4973#if defined(__aarch64__) || defined(_M_ARM64)
4974 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
4975#else
4976 return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
4977#endif
4978}
4979
4980// Set packed 16-bit integers in dst with the supplied values in reverse order.
4981// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
4982FORCE_INLINE __m128i _mm_setr_epi16(short w0,
4983 short w1,
4984 short w2,
4985 short w3,
4986 short w4,
4987 short w5,
4988 short w6,
4989 short w7)
4990{
4991 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
4992 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
4993}
4994
4995// Set packed 32-bit integers in dst with the supplied values in reverse order.
4996// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
4997FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
4998{
4999 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5000 return vreinterpretq_m128i_s32(vld1q_s32(data));
5001}
5002
5003// Set packed 64-bit integers in dst with the supplied values in reverse order.
5004// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
5005FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5006{
5007 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5008}
5009
5010// Set packed 8-bit integers in dst with the supplied values in reverse order.
5011// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
5012FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5013 signed char b1,
5014 signed char b2,
5015 signed char b3,
5016 signed char b4,
5017 signed char b5,
5018 signed char b6,
5019 signed char b7,
5020 signed char b8,
5021 signed char b9,
5022 signed char b10,
5023 signed char b11,
5024 signed char b12,
5025 signed char b13,
5026 signed char b14,
5027 signed char b15)
5028{
5029 int8_t ALIGN_STRUCT(16)
5030 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5031 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5032 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5033 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5034 return (__m128i) vld1q_s8(data);
5035}
5036
5037// Set packed double-precision (64-bit) floating-point elements in dst with the
5038// supplied values in reverse order.
5039// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
5040FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5041{
5042 return _mm_set_pd(e0, e1);
5043}
5044
5045// Return vector of type __m128d with all elements set to zero.
5046// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
5047FORCE_INLINE __m128d _mm_setzero_pd(void)
5048{
5049#if defined(__aarch64__) || defined(_M_ARM64)
5050 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5051#else
5052 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5053#endif
5054}
5055
5056// Return vector of type __m128i with all elements set to zero.
5057// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
5058FORCE_INLINE __m128i _mm_setzero_si128(void)
5059{
5060 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5061}
5062
// Shuffle 32-bit integers in a using the control in imm8, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
//                                        __constrange(0,255) int imm)
#if defined(_sse2neon_shuffle)
#define _mm_shuffle_epi32(a, imm)                                            \
    __extension__({                                                          \
        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
        int32x4_t _shuf =                                                    \
            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
        vreinterpretq_m128i_s32(_shuf);                                      \
    })
#else  // generic
#define _mm_shuffle_epi32(a, imm)                               \
    _sse2neon_define1(                                          \
        __m128i, a, __m128i ret; switch (imm) {                 \
            case _MM_SHUFFLE(1, 0, 3, 2):                       \
                ret = _mm_shuffle_epi_1032(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(2, 3, 0, 1):                       \
                ret = _mm_shuffle_epi_2301(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(0, 3, 2, 1):                       \
                ret = _mm_shuffle_epi_0321(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(2, 1, 0, 3):                       \
                ret = _mm_shuffle_epi_2103(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(1, 0, 1, 0):                       \
                ret = _mm_shuffle_epi_1010(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(1, 0, 0, 1):                       \
                ret = _mm_shuffle_epi_1001(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(0, 1, 0, 1):                       \
                ret = _mm_shuffle_epi_0101(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(2, 2, 1, 1):                       \
                ret = _mm_shuffle_epi_2211(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(0, 1, 2, 2):                       \
                ret = _mm_shuffle_epi_0122(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(3, 3, 3, 2):                       \
                ret = _mm_shuffle_epi_3332(_a);                 \
                break;                                          \
            case _MM_SHUFFLE(0, 0, 0, 0):                       \
                ret = _mm_shuffle_epi32_splat(_a, 0);           \
                break;                                          \
            case _MM_SHUFFLE(1, 1, 1, 1):                       \
                ret = _mm_shuffle_epi32_splat(_a, 1);           \
                break;                                          \
            case _MM_SHUFFLE(2, 2, 2, 2):                       \
                ret = _mm_shuffle_epi32_splat(_a, 2);           \
                break;                                          \
            case _MM_SHUFFLE(3, 3, 3, 3):                       \
                ret = _mm_shuffle_epi32_splat(_a, 3);           \
                break;                                          \
            default:                                            \
                ret = _mm_shuffle_epi32_default(_a, (imm));     \
                break;                                          \
        } _sse2neon_return(ret);)
#endif
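
// Usage sketch (illustrative only, not part of the upstream header; the
// SSE2NEON_USAGE_EXAMPLES guard and example function are hypothetical
// additions): _MM_SHUFFLE packs four 2-bit lane selectors, highest
// destination lane first, so _MM_SHUFFLE(0, 1, 2, 3) reverses the lanes.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_shuffle_epi32(void)
{
    __m128i v = _mm_setr_epi32(10, 11, 12, 13);
    // Destination lane 0 takes source lane 3, and so on.
    __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
    return vgetq_lane_s32(vreinterpretq_s32_m128i(r), 0) == 13;
}
#endif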

// Shuffle double-precision (64-bit) floating-point elements using the control
// in imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
#ifdef _sse2neon_shuffle
#define _mm_shuffle_pd(a, b, imm8)                                            \
    vreinterpretq_m128d_s64(                                                  \
        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
#else
#define _mm_shuffle_pd(a, b, imm8)                                     \
    _mm_castsi128_pd(_mm_set_epi64x(                                   \
        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
#endif

// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if defined(_sse2neon_shuffle)
#define _mm_shufflehi_epi16(a, imm)                                           \
    __extension__({                                                           \
        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
        int16x8_t _shuf =                                                     \
            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
                          (((imm) >> 6) & 0x3) + 4);                          \
        vreinterpretq_m128i_s16(_shuf);                                       \
    })
#else  // generic
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
#endif

// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if defined(_sse2neon_shuffle)
#define _mm_shufflelo_epi16(a, imm)                                  \
    __extension__({                                                  \
        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
        int16x8_t _shuf = vshuffleq_s16(                             \
            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
        vreinterpretq_m128i_s16(_shuf);                              \
    })
#else  // generic
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
#endif

// Shift packed 16-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~15))
        return _mm_setzero_si128();

    int16x8_t vc = vdupq_n_s16((int16_t) c);
    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
}

// Shift packed 32-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~31))
        return _mm_setzero_si128();

    int32x4_t vc = vdupq_n_s32((int32_t) c);
    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
}

// Shift packed 64-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~63))
        return _mm_setzero_si128();

    int64x2_t vc = vdupq_n_s64((int64_t) c);
    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
}

// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~15))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s16(
        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
}

// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~31))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s32(
        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
}

// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~63))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s64(
        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
}
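
// Usage sketch (illustrative only, not part of the upstream header; the
// SSE2NEON_USAGE_EXAMPLES guard is a hypothetical addition): _mm_slli_* takes
// the shift count as an immediate, _mm_sll_* reads it from the low 64 bits of
// a vector; both return zero once the count exceeds the element width.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_shift_left(void)
{
    __m128i v = _mm_set1_epi32(1);
    __m128i a = _mm_slli_epi32(v, 4);                    // each lane: 1 << 4
    __m128i b = _mm_sll_epi32(v, _mm_cvtsi32_si128(4));  // same, vector count
    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0) == 16 &&
           vgetq_lane_s32(vreinterpretq_s32_m128i(b), 0) == 16;
}
#endif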

// Shift a left by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
#define _mm_slli_si128(a, imm)                                              \
    _sse2neon_define1(                                                      \
        __m128i, a, int8x16_t ret;                                          \
        if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);      \
        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a),      \
                            ((imm <= 0 || imm > 15) ? 0 : (16 - imm)));     \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)
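
// Usage sketch (illustrative only; guard and function are hypothetical
// additions): a byte shift moves whole lanes, so shifting left by 4 bytes
// moves 32-bit lane 0 into lane 1 and zero-fills lane 0.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_slli_si128(void)
{
    __m128i v = _mm_setr_epi32(1, 2, 3, 4);
    __m128i r = _mm_slli_si128(v, 4);
    return vgetq_lane_s32(vreinterpretq_s32_m128i(r), 1) == 1 &&
           vgetq_lane_s32(vreinterpretq_s32_m128i(r), 0) == 0;
}
#endif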

// Compute the square root of packed double-precision (64-bit) floating-point
// elements in a, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
#else
    double a0 = sqrt(((double *) &a)[0]);
    double a1 = sqrt(((double *) &a)[1]);
    return _mm_set_pd(a1, a0);
#endif
}

// Compute the square root of the lower double-precision (64-bit) floating-point
// element in b, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_sqrt_pd(b));
#else
    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
#endif
}

// Shift packed 16-bit integers in a right by count while shifting in sign bits,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
{
    int64_t c = vgetq_lane_s64(count, 0);
    if (_sse2neon_unlikely(c & ~15))
        return _mm_cmplt_epi16(a, _mm_setzero_si128());
    return vreinterpretq_m128i_s16(
        vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
}

// Shift packed 32-bit integers in a right by count while shifting in sign bits,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
{
    int64_t c = vgetq_lane_s64(count, 0);
    if (_sse2neon_unlikely(c & ~31))
        return _mm_cmplt_epi32(a, _mm_setzero_si128());
    return vreinterpretq_m128i_s32(
        vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
}

// Shift packed 16-bit integers in a right by imm8 while shifting in sign
// bits, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
{
    const int count = (imm & ~15) ? 15 : imm;
    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
}

// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                                \
    _sse2neon_define0(                                                        \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) {        \
            ret = _a;                                                         \
        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {               \
            ret = vreinterpretq_m128i_s32(                                    \
                vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
        } else {                                                              \
            ret = vreinterpretq_m128i_s32(                                    \
                vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31));                \
        } _sse2neon_return(ret);)

// Shift packed 16-bit integers in a right by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~15))
        return _mm_setzero_si128();

    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
}

// Shift packed 32-bit integers in a right by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~31))
        return _mm_setzero_si128();

    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
}

// Shift packed 64-bit integers in a right by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~63))
        return _mm_setzero_si128();

    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
}

// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
#define _mm_srli_epi16(a, imm)                                                \
    _sse2neon_define0(                                                        \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) {       \
            ret = _mm_setzero_si128();                                        \
        } else {                                                              \
            ret = vreinterpretq_m128i_u16(                                    \
                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
        } _sse2neon_return(ret);)

// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                                \
    _sse2neon_define0(                                                        \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) {       \
            ret = _mm_setzero_si128();                                        \
        } else {                                                              \
            ret = vreinterpretq_m128i_u32(                                    \
                vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
        } _sse2neon_return(ret);)

// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
#define _mm_srli_epi64(a, imm)                                                \
    _sse2neon_define0(                                                        \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) {       \
            ret = _mm_setzero_si128();                                        \
        } else {                                                              \
            ret = vreinterpretq_m128i_u64(                                    \
                vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
        } _sse2neon_return(ret);)

// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
#define _mm_srli_si128(a, imm)                                        \
    _sse2neon_define1(                                                \
        __m128i, a, int8x16_t ret;                                    \
        if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0);     \
        else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
                            (imm > 15 ? 0 : imm));                    \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)
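
// Usage sketch (illustrative only; guard and function are hypothetical
// additions): arithmetic right shifts replicate the sign bit, logical right
// shifts fill with zeros.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_right_shift(void)
{
    __m128i v = _mm_set1_epi32(-16);         // 0xFFFFFFF0 per lane
    __m128i sra = _mm_srai_epi32(v, 2);      // arithmetic: -16 >> 2 == -4
    __m128i srl = _mm_srli_epi32(v, 2);      // logical: 0x3FFFFFFC
    return vgetq_lane_s32(vreinterpretq_s32_m128i(sra), 0) == -4 &&
           (uint32_t) vgetq_lane_s32(vreinterpretq_s32_m128i(srl), 0) ==
               0x3FFFFFFCu;
}
#endif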

// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
// or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
#else
    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
#endif
}

// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
    vst1q_f64((float64_t *) mem_addr,
              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
#else
    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
    vst1q_f32((float32_t *) mem_addr,
              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
#endif
}

// Store the lower double-precision (64-bit) floating-point element from a into
// memory. mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
#endif
}

// Store 128-bits of integer data from a into memory. mem_addr must be aligned
// on a 16-byte boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
#define _mm_store1_pd _mm_store_pd1

// Store the upper double-precision (64-bit) floating-point element from a into
// memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr,
             vget_high_f64(vreinterpretq_f64_m128d(a)));
#else
    vst1_f32((float32_t *) mem_addr,
             vget_high_f32(vreinterpretq_f32_m128d(a)));
#endif
}

// Store 64-bit integer from the first element of a into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
{
    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
}

// Store the lower double-precision (64-bit) floating-point element from a into
// memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
#endif
}

// Store 2 double-precision (64-bit) floating-point elements from a into memory
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
{
    float32x4_t f = vreinterpretq_f32_m128d(a);
    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
}

// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory. mem_addr does not need to be aligned on any
// particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
{
    _mm_store_pd(mem_addr, a);
}

// Store 128-bits of integer data from a into memory. mem_addr does not need to
// be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
{
    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
}

// Store 32-bit integer from the first element of a into memory. mem_addr does
// not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
{
    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
}
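
// Usage sketch (illustrative only; guard and function are hypothetical
// additions): _mm_store_pd requires 16-byte alignment on x86, while
// _mm_storeu_pd does not; under this NEON port both lower to the same
// unaligned-capable vst1q, so only the x86 contract differs.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline double sse2neon_example_store(void)
{
    double ALIGN_STRUCT(16) buf[2];
    _mm_store_pd(buf, _mm_set_pd(2.0, 1.0));  // buf[0] = 1.0, buf[1] = 2.0
    return buf[0] + buf[1];                   // 3.0
}
#endif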

// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory using a non-temporal memory hint. mem_addr must
// be aligned on a 16-byte boundary or a general-protection exception may be
// generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (__m128d *) p);
#elif defined(__aarch64__) || defined(_M_ARM64)
    vst1q_f64(p, vreinterpretq_f64_m128d(a));
#else
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
#endif
}

// Store 128-bits of integer data from a into memory using a non-temporal memory
// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
// exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, p);
#else
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
#endif
}

// Store 32-bit integer a into memory using a non-temporal hint to minimize
// cache pollution. If the cache line containing address mem_addr is already in
// the cache, the cache will be updated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
FORCE_INLINE void _mm_stream_si32(int *p, int a)
{
    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
}

// Store 64-bit integer a into memory using a non-temporal hint to minimize
// cache pollution. If the cache line containing address mem_addr is already in
// the cache, the cache will be updated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
{
    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
}
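
// Usage sketch (illustrative only; guard, function, and the dst buffer are
// hypothetical): on ARM the non-temporal hint is best-effort. Unless the
// compiler exposes __builtin_nontemporal_store, these intrinsics degrade to
// ordinary stores, so streaming writes stay correct but lose the
// cache-bypass optimization.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline void sse2neon_example_stream(__m128i *dst, __m128i v)
{
    _mm_stream_si128(dst, v);  // plain vst1q on most ARM toolchains
}
#endif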

// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Subtract packed double-precision (64-bit) floating-point elements in b from
// packed double-precision (64-bit) floating-point elements in a, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] - db[0];
    c[1] = da[1] - db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Subtract the lower double-precision (64-bit) floating-point element in b from
// the lower double-precision (64-bit) floating-point element in a, store the
// result in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_sub_pd(a, b));
}

// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    return vreinterpret_m64_s64(
        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
// using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
// using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
// integers in a using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
// integers in a using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
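
// Usage sketch (illustrative only; guard and function are hypothetical
// additions): saturating subtraction clamps at the type's limits instead of
// wrapping, so an unsigned underflow yields 0.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_subs(void)
{
    __m128i a = _mm_set1_epi8(10);
    __m128i b = _mm_set1_epi8(20);
    __m128i r = _mm_subs_epu8(a, b);  // 10 - 20 saturates to 0, not 246
    return vgetq_lane_u8(vreinterpretq_u8_m128i(r), 0) == 0;
}
#endif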

#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd

// Return vector of type __m128d with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
FORCE_INLINE __m128d _mm_undefined_pd(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128d a;
#if defined(_MSC_VER)
    a = _mm_setzero_pd();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Unpack and interleave 16-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(
        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 32-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(
        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 64-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s64(
        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
#endif
}

// Unpack and interleave 8-bit integers from the high half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(
        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    int8x8_t a1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave double-precision (64-bit) floating-point elements from
// the high half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
                     vget_high_s64(vreinterpretq_s64_m128d(b))));
#endif
}

// Unpack and interleave 16-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(
        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 32-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(
        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 64-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s64(
        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
#endif
}

// Unpack and interleave 8-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(
        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave double-precision (64-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
                     vget_low_s64(vreinterpretq_s64_m128d(b))));
#endif
}
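
// Usage sketch (illustrative only; guard and function are hypothetical
// additions): unpacklo interleaves the low halves of a and b as
// {a0, b0, a1, b1}, unpackhi does the same for the high halves.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_unpack(void)
{
    __m128i a = _mm_setr_epi32(0, 1, 2, 3);
    __m128i b = _mm_setr_epi32(4, 5, 6, 7);
    __m128i lo = _mm_unpacklo_epi32(a, b);  // {0, 4, 1, 5}
    __m128i hi = _mm_unpackhi_epi32(a, b);  // {2, 6, 3, 7}
    return vgetq_lane_s32(vreinterpretq_s32_m128i(lo), 1) == 4 &&
           vgetq_lane_s32(vreinterpretq_s32_m128i(hi), 0) == 2;
}
#endif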

// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_s64(
        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
// and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

/* SSE3 */

// Alternatively add and subtract packed double-precision (64-bit)
// floating-point elements in a to/from packed elements in b, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
{
    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
                                             vreinterpretq_f64_m128d(b),
                                             vreinterpretq_f64_m128d(mask)));
#else
    return _mm_add_pd(_mm_mul_pd(b, mask), a);
#endif
}

// Alternatively add and subtract packed single-precision (32-bit)
// floating-point elements in a to/from packed elements in b, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
{
    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_FMA) /* VFPv4+ */
    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                            vreinterpretq_f32_m128(mask),
                                            vreinterpretq_f32_m128(b)));
#else
    return _mm_add_ps(_mm_mul_ps(b, mask), a);
#endif
}
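
// Usage sketch (illustrative only; guard and function are hypothetical
// additions): addsub subtracts in even lanes and adds in odd lanes, i.e.
// {a0-b0, a1+b1, a2-b2, a3+b3}.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_addsub(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 10.0f, 10.0f, 10.0f);
    __m128 r = _mm_addsub_ps(a, b);  // {-9, 12, -7, 14}
    return vgetq_lane_f32(vreinterpretq_f32_m128(r), 0) == -9.0f;
}
#endif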

// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[] = {da[0] + da[1], db[0] + db[1]};
    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}

// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(
        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
#endif
}

// Horizontally subtract adjacent pairs of double-precision (64-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    float64x2_t a = vreinterpretq_f64_m128d(_a);
    float64x2_t b = vreinterpretq_f64_m128d(_b);
    return vreinterpretq_m128d_f64(
        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
#else
    double *da = (double *) &_a;
    double *db = (double *) &_b;
    double c[] = {da[0] - da[1], db[0] - db[1]};
    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}

// Horizontally subtract adjacent pairs of single-precision (32-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
{
    float32x4_t a = vreinterpretq_f32_m128(_a);
    float32x4_t b = vreinterpretq_f32_m128(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
#else
    float32x4x2_t c = vuzpq_f32(a, b);
    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
#endif
}
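
// Usage sketch (illustrative only; guard and function are hypothetical
// additions): horizontal add packs pair sums as {a0+a1, a2+a3, b0+b1, b2+b3}.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_hadd_ps(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 r = _mm_hadd_ps(a, a);  // {3, 7, 3, 7}
    return vgetq_lane_f32(vreinterpretq_f32_m128(r), 0) == 3.0f;
}
#endif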

// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
// may perform better than _mm_loadu_si128 when the data crosses a cache line
// boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
#define _mm_lddqu_si128 _mm_loadu_si128

// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
#define _mm_loaddup_pd _mm_load1_pd

// Duplicate the low double-precision (64-bit) floating-point element from a,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
#else
    return vreinterpretq_m128d_u64(
        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
#endif
}

// Duplicate odd-indexed single-precision (32-bit) floating-point elements
// from a, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
#elif defined(_sse2neon_shuffle)
    return vreinterpretq_m128_f32(vshuffleq_s32(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
#else
    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
    return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}

// Duplicate even-indexed single-precision (32-bit) floating-point elements
// from a, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
#elif defined(_sse2neon_shuffle)
    return vreinterpretq_m128_f32(vshuffleq_s32(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
#else
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
    return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}

/* SSSE3 */

// Compute the absolute value of packed signed 16-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
{
    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
}

// Compute the absolute value of packed signed 32-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
}

// Compute the absolute value of packed signed 8-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
{
    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
}

// Compute the absolute value of packed signed 16-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
{
    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
}

// Compute the absolute value of packed signed 32-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
{
    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
}

// Compute the absolute value of packed signed 8-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
{
    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
}

// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
// the result right by imm8 bytes, and store the low 16 bytes in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
#if defined(__GNUC__) && !defined(__clang__)
#define _mm_alignr_epi8(a, b, imm)                                          \
    __extension__({                                                         \
        uint8x16_t _a = vreinterpretq_u8_m128i(a);                          \
        uint8x16_t _b = vreinterpretq_u8_m128i(b);                          \
        __m128i ret;                                                        \
        if (_sse2neon_unlikely((imm) & ~31))                                \
            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                    \
        else if (imm >= 16)                                                 \
            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);              \
        else                                                                \
            ret =                                                           \
                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
        ret;                                                                \
    })

#else
#define _mm_alignr_epi8(a, b, imm)                                            \
    _sse2neon_define2(                                                        \
        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a);           \
        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;             \
        if (_sse2neon_unlikely((imm) & ~31)) ret =                            \
            vreinterpretq_m128i_u8(vdupq_n_u8(0));                            \
        else if (imm >= 16) ret =                                             \
            _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0);                     \
        else ret =                                                            \
            vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0));   \
        _sse2neon_return(ret);)

#endif

// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
// the result right by imm8 bytes, and store the low 8 bytes in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
#define _mm_alignr_pi8(a, b, imm)                                           \
    _sse2neon_define2(                                                      \
        __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) {      \
            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
        } else {                                                            \
            uint8x8_t tmp_low;                                              \
            uint8x8_t tmp_high;                                             \
            if ((imm) >= 8) {                                               \
                const int idx = (imm) - 8;                                  \
                tmp_low = vreinterpret_u8_m64(_a);                          \
                tmp_high = vdup_n_u8(0);                                    \
                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
            } else {                                                        \
                const int idx = (imm);                                      \
                tmp_low = vreinterpret_u8_m64(_b);                          \
                tmp_high = vreinterpret_u8_m64(_a);                         \
                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
            }                                                               \
        } _sse2neon_return(ret);)
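
// Usage sketch (illustrative only; guard and function are hypothetical
// additions): alignr concatenates a:b with a in the high half and shifts
// right by imm bytes, so imm = 4 yields bytes {b4..b15, a0..a3}.
#ifdef SSE2NEON_USAGE_EXAMPLES
static inline int sse2neon_example_alignr(void)
{
    __m128i a = _mm_set1_epi8(1);
    __m128i b = _mm_set1_epi8(2);
    __m128i r = _mm_alignr_epi8(a, b, 4);
    return vgetq_lane_u8(vreinterpretq_u8_m128i(r), 0) == 2 &&   // from b
           vgetq_lane_u8(vreinterpretq_u8_m128i(r), 12) == 1;    // from a
}
#endif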

// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
// signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
#else
    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
#endif
}

// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
// signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
#else
    return vreinterpretq_m128i_s32(
        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
#endif
}

// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
// signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
{
    return vreinterpret_m64_s16(
        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
// signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
{
    return vreinterpret_m64_s32(
        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
}

// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
// saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
    return vreinterpretq_s64_s16(
        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
#else
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
    // Interleave using vshrn/vmovn
    // [a0|a2|a4|a6|b0|b2|b4|b6]
    // [a1|a3|a5|a7|b1|b3|b5|b7]
    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
    // Saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
#endif
}

// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
// saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
#else
    int16x4x2_t res = vuzp_s16(a, b);
    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
// the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(
        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
#else
    int16x8x2_t c = vuzpq_s16(a, b);
    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
// the signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(
        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
#else
    int32x4x2_t c = vuzpq_s32(a, b);
    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
// the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
#else
    int16x4x2_t c = vuzp_s16(a, b);
    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
// the signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
{
    int32x2_t a = vreinterpret_s32_m64(_a);
    int32x2_t b = vreinterpret_s32_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
#else
    int32x2x2_t c = vuzp_s32(a, b);
    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
// using saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
#else
    int16x8x2_t c = vuzpq_s16(a, b);
    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
#endif
}

// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
// using saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
#else
    int16x4x2_t c = vuzp_s16(a, b);
    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
#endif
}

// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
// and pack the saturated results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                             vmovl_s8(vget_low_s8(b)));
    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                             vmovl_s8(vget_high_s8(b)));
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
#else
    // This would be much simpler if x86 would choose to zero extend OR sign
    // extend, not both. This could probably be optimized better.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend by shifting left then shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // multiply
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

    // saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
#endif
}

// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
// pack the saturated results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
{
    uint16x4_t a = vreinterpret_u16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);

    // Zero extend a
    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));

    // Sign extend by shifting left then shifting right.
    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
    int16x4_t b_odd = vshr_n_s16(b, 8);

    // multiply
    int16x4_t prod1 = vmul_s16(a_even, b_even);
    int16x4_t prod2 = vmul_s16(a_odd, b_odd);

    // saturated add
    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
}
6419
6420// Multiply packed signed 16-bit integers in a and b, producing intermediate
6421// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6422// the packed 16-bit integers in dst.
6423// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
6424FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6425{
6426 // vqrdmulhq_s16(a, b) cannot be used directly: it saturates
6427 // INT16_MIN * INT16_MIN to 0x7FFF, whereas x86 wraps it to 0x8000.
6428
6429 // Multiply
6430 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6431 vget_low_s16(vreinterpretq_s16_m128i(b)));
6432 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6433 vget_high_s16(vreinterpretq_s16_m128i(b)));
6434
6435 // Rounding narrowing shift right
6436 // narrow = (int16_t)((mul + 16384) >> 15);
6437 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6438 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6439
6440 // Join together
6441 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6442}
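
// Worked example (illustrative addition): per lane the result is
// (a * b + 0x4000) >> 15, i.e. a Q15 fixed-point multiply, so
// 0.5 * 0.25 = 0.125: (0x4000 * 0x2000 + 0x4000) >> 15 == 0x1000.
FORCE_INLINE int _sse2neon_example_mulhrs(void)
{
    __m128i a = _mm_set1_epi16(0x4000); /* 0.5 in Q15 */
    __m128i b = _mm_set1_epi16(0x2000); /* 0.25 in Q15 */
    __m128i r = _mm_mulhrs_epi16(a, b);
    return vgetq_lane_s16(vreinterpretq_s16_m128i(r), 0) == 0x1000;
}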
6443
6444// Multiply packed signed 16-bit integers in a and b, producing intermediate
6445// signed 32-bit integers. Truncate each intermediate integer to the 18 most
6446// significant bits, round by adding 1, and store bits [16:1] to dst.
6447// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
6448FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6449{
6450 int32x4_t mul_extend =
6451 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6452
6453 // Rounding narrowing shift right
6454 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6455}
6456
6457// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6458// corresponding 8-bit element of b, and store the results in dst.
6459// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
6460FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
6461{
6462 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
6463 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
6464 uint8x16_t idx_masked =
6465 vandq_u8(idx, vdupq_n_u8(0x8F)); // keep the index bits and the zeroing flag (bit 7); bits 4..6 are ignored
6466#if defined(__aarch64__) || defined(_M_ARM64)
6467 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
6468#elif defined(__GNUC__)
6469 int8x16_t ret;
6470 // %e and %f represent the even and odd D registers
6471 // respectively.
6472 __asm__ __volatile__(
6473 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6474 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
6475 : [ret] "=&w"(ret)
6476 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6477 return vreinterpretq_m128i_s8(ret);
6478#else
6479 // ARMv7 fallback: split the table into two D registers for vtbl2
6480 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6481 return vreinterpretq_m128i_s8(
6482 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6483 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
6484#endif
6485}
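
// Usage sketch (illustrative addition): reverse the 16 bytes of a vector.
// Control bytes with bit 7 set would zero the destination byte instead.
FORCE_INLINE __m128i _sse2neon_example_reverse_bytes(__m128i v)
{
    const __m128i rev =
        _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    return _mm_shuffle_epi8(v, rev);
}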
6486
6487// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6488// corresponding 8-bit element of b, and store the results in dst.
6489// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
6490FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
6491{
6492 const int8x8_t controlMask =
6493 vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
6494 int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
6495 return vreinterpret_m64_s8(res);
6496}
6497
6498// Negate packed 16-bit integers in a when the corresponding signed
6499// 16-bit integer in b is negative, and store the results in dst.
6500// Elements in dst are zeroed out when the corresponding element
6501// in b is zero.
6502// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
6503FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6504{
6505 int16x8_t a = vreinterpretq_s16_m128i(_a);
6506 int16x8_t b = vreinterpretq_s16_m128i(_b);
6507
6508 // signed shift right: faster than vclt
6509 // (b < 0) ? 0xFFFF : 0
6510 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6511 // (b == 0) ? 0xFFFF : 0
6512#if defined(__aarch64__) || defined(_M_ARM64)
6513 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6514#else
6515 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6516#endif
6517
6518 // bitwise select either a or its negation (vnegq_s16(a) yields -a)
6519 // based on ltMask
6520 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6521 // res = masked & (~zeroMask)
6522 int16x8_t res = vbicq_s16(masked, zeroMask);
6523 return vreinterpretq_m128i_s16(res);
6524}
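
// Worked example (illustrative addition): with a = {10, 10, 10, ...} and
// b = {-5, 0, 7, ...}, the first three result lanes are {-10, 0, 10}.
FORCE_INLINE int _sse2neon_example_sign_epi16(void)
{
    __m128i a = _mm_set1_epi16(10);
    __m128i b = _mm_setr_epi16(-5, 0, 7, 1, 1, 1, 1, 1);
    __m128i r = _mm_sign_epi16(a, b);
    return vgetq_lane_s16(vreinterpretq_s16_m128i(r), 0) == -10 &&
           vgetq_lane_s16(vreinterpretq_s16_m128i(r), 1) == 0 &&
           vgetq_lane_s16(vreinterpretq_s16_m128i(r), 2) == 10;
}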
6525
6526// Negate packed 32-bit integers in a when the corresponding signed
6527// 32-bit integer in b is negative, and store the results in dst.
6528// Elements in dst are zeroed out when the corresponding element
6529// in b is zero.
6530// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
6531FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6532{
6533 int32x4_t a = vreinterpretq_s32_m128i(_a);
6534 int32x4_t b = vreinterpretq_s32_m128i(_b);
6535
6536 // signed shift right: faster than vclt
6537 // (b < 0) ? 0xFFFFFFFF : 0
6538 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6539
6540 // (b == 0) ? 0xFFFFFFFF : 0
6541#if defined(__aarch64__) || defined(_M_ARM64)
6542 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6543#else
6544 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6545#endif
6546
6547 // bitwise select either a or its negation (vnegq_s32(a) yields -a)
6548 // based on ltMask
6549 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6550 // res = masked & (~zeroMask)
6551 int32x4_t res = vbicq_s32(masked, zeroMask);
6552 return vreinterpretq_m128i_s32(res);
6553}
6554
6555// Negate packed 8-bit integers in a when the corresponding signed
6556// 8-bit integer in b is negative, and store the results in dst.
6557// Elements in dst are zeroed out when the corresponding element
6558// in b is zero.
6559// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
6560FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6561{
6562 int8x16_t a = vreinterpretq_s8_m128i(_a);
6563 int8x16_t b = vreinterpretq_s8_m128i(_b);
6564
6565 // signed shift right: faster than vclt
6566 // (b < 0) ? 0xFF : 0
6567 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6568
6569 // (b == 0) ? 0xFF : 0
6570#if defined(__aarch64__) || defined(_M_ARM64)
6571 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6572#else
6573 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6574#endif
6575
6576 // bitwise select either a or its negation (vnegq_s8(a) yields -a)
6577 // based on ltMask
6578 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6579 // res = masked & (~zeroMask)
6580 int8x16_t res = vbicq_s8(masked, zeroMask);
6581
6582 return vreinterpretq_m128i_s8(res);
6583}
6584
6585// Negate packed 16-bit integers in a when the corresponding signed 16-bit
6586// integer in b is negative, and store the results in dst. Elements in dst are
6587// zeroed out when the corresponding element in b is zero.
6588// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
6589FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
6590{
6591 int16x4_t a = vreinterpret_s16_m64(_a);
6592 int16x4_t b = vreinterpret_s16_m64(_b);
6593
6594 // signed shift right: faster than vclt
6595 // (b < 0) ? 0xFFFF : 0
6596 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6597
6598 // (b == 0) ? 0xFFFF : 0
6599#if defined(__aarch64__) || defined(_M_ARM64)
6600 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6601#else
6602 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6603#endif
6604
6605 // bitwise select either a or its negation (vneg_s16(a) yields -a)
6606 // based on ltMask
6607 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6608 // res = masked & (~zeroMask)
6609 int16x4_t res = vbic_s16(masked, zeroMask);
6610
6611 return vreinterpret_m64_s16(res);
6612}
6613
6614// Negate packed 32-bit integers in a when the corresponding signed 32-bit
6615// integer in b is negative, and store the results in dst. Elements in dst are
6616// zeroed out when the corresponding element in b is zero.
6617// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
6618FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
6619{
6620 int32x2_t a = vreinterpret_s32_m64(_a);
6621 int32x2_t b = vreinterpret_s32_m64(_b);
6622
6623 // signed shift right: faster than vclt
6624 // (b < 0) ? 0xFFFFFFFF : 0
6625 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6626
6627 // (b == 0) ? 0xFFFFFFFF : 0
6628#if defined(__aarch64__) || defined(_M_ARM64)
6629 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6630#else
6631 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6632#endif
6633
6634 // bitwise select either a or its negation (vneg_s32(a) yields -a)
6635 // based on ltMask
6636 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6637 // res = masked & (~zeroMask)
6638 int32x2_t res = vbic_s32(masked, zeroMask);
6639
6640 return vreinterpret_m64_s32(res);
6641}
6642
6643// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
6644// in b is negative, and store the results in dst. Elements in dst are zeroed out
6645// when the corresponding element in b is zero.
6646// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
6647FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
6648{
6649 int8x8_t a = vreinterpret_s8_m64(_a);
6650 int8x8_t b = vreinterpret_s8_m64(_b);
6651
6652 // signed shift right: faster than vclt
6653 // (b < 0) ? 0xFF : 0
6654 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
6655
6656 // (b == 0) ? 0xFF : 0
6657#if defined(__aarch64__) || defined(_M_ARM64)
6658 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
6659#else
6660 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
6661#endif
6662
6663 // bitwise select either a or its negation (vneg_s8(a) yields -a)
6664 // based on ltMask
6665 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
6666 // res = masked & (~zeroMask)
6667 int8x8_t res = vbic_s8(masked, zeroMask);
6668
6669 return vreinterpret_m64_s8(res);
6670}
6671
6672/* SSE4.1 */
6673
6674// Blend packed 16-bit integers from a and b using control mask imm8, and store
6675// the results in dst.
6676// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
6677// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
6678// __constrange(0,255) int imm)
6679#define _mm_blend_epi16(a, b, imm) \
6680 _sse2neon_define2( \
6681 __m128i, a, b, \
6682 const uint16_t _mask[8] = \
6683 _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
6684 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
6685 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
6686 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
6687 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
6688 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
6689 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
6690 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
6691 uint16x8_t _mask_vec = vld1q_u16(_mask); \
6692 uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6693 uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
6694 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
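
// Usage sketch (illustrative addition): each imm8 bit selects a 16-bit lane
// from b (bit set) or a (bit clear), so 0x0F takes the low four lanes from b.
FORCE_INLINE __m128i _sse2neon_example_blend_low4(__m128i a, __m128i b)
{
    return _mm_blend_epi16(a, b, 0x0F);
}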
6695
6696// Blend packed double-precision (64-bit) floating-point elements from a and b
6697// using control mask imm8, and store the results in dst.
6698// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
6699#define _mm_blend_pd(a, b, imm) \
6700 _sse2neon_define2( \
6701 __m128d, a, b, \
6702 const uint64_t _mask[2] = \
6703 _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
6704 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
6705 uint64x2_t _mask_vec = vld1q_u64(_mask); \
6706 uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
6707 uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
6708 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6709
6710// Blend packed single-precision (32-bit) floating-point elements from a and b
6711// using mask, and store the results in dst.
6712// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
6713FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
6714{
6715 const uint32_t ALIGN_STRUCT(16)
6716 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
6717 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
6718 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
6719 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
6720 uint32x4_t mask = vld1q_u32(data);
6721 float32x4_t a = vreinterpretq_f32_m128(_a);
6722 float32x4_t b = vreinterpretq_f32_m128(_b);
6723 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6724}
6725
6726// Blend packed 8-bit integers from a and b using mask, and store the results in
6727// dst.
6728// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
6729FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
6730{
6731 // Use a signed shift right to create a mask with the sign bit
6732 uint8x16_t mask =
6733 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
6734 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6735 uint8x16_t b = vreinterpretq_u8_m128i(_b);
6736 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
6737}
6738
6739// Blend packed double-precision (64-bit) floating-point elements from a and b
6740// using mask, and store the results in dst.
6741// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
6742FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
6743{
6744 uint64x2_t mask =
6745 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
6746#if defined(__aarch64__) || defined(_M_ARM64)
6747 float64x2_t a = vreinterpretq_f64_m128d(_a);
6748 float64x2_t b = vreinterpretq_f64_m128d(_b);
6749 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
6750#else
6751 uint64x2_t a = vreinterpretq_u64_m128d(_a);
6752 uint64x2_t b = vreinterpretq_u64_m128d(_b);
6753 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
6754#endif
6755}
6756
6757// Blend packed single-precision (32-bit) floating-point elements from a and b
6758// using mask, and store the results in dst.
6759// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
6760FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
6761{
6762 // Use a signed shift right to create a mask with the sign bit
6763 uint32x4_t mask =
6764 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
6765 float32x4_t a = vreinterpretq_f32_m128(_a);
6766 float32x4_t b = vreinterpretq_f32_m128(_b);
6767 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6768}
6769
6770// Round the packed double-precision (64-bit) floating-point elements in a up
6771// to an integer value, and store the results as packed double-precision
6772// floating-point elements in dst.
6773// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
6774FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
6775{
6776#if defined(__aarch64__) || defined(_M_ARM64)
6777 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
6778#else
6779 double *f = (double *) &a;
6780 return _mm_set_pd(ceil(f[1]), ceil(f[0]));
6781#endif
6782}
6783
6784// Round the packed single-precision (32-bit) floating-point elements in a up to
6785// an integer value, and store the results as packed single-precision
6786// floating-point elements in dst.
6787// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
6788FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
6789{
6790#if (defined(__aarch64__) || defined(_M_ARM64)) || \
6791 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
6792 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
6793#else
6794 float *f = (float *) &a;
6795 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
6796#endif
6797}
6798
6799// Round the lower double-precision (64-bit) floating-point element in b up to
6800// an integer value, store the result as a double-precision floating-point
6801// element in the lower element of dst, and copy the upper element from a to the
6802// upper element of dst.
6803// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
6804FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
6805{
6806 return _mm_move_sd(a, _mm_ceil_pd(b));
6807}
6808
6809// Round the lower single-precision (32-bit) floating-point element in b up to
6810// an integer value, store the result as a single-precision floating-point
6811// element in the lower element of dst, and copy the upper 3 packed elements
6812// from a to the upper elements of dst.
6813// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
6814FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
6815{
6816 return _mm_move_ss(a, _mm_ceil_ps(b));
6817}
6818
6819// Compare packed 64-bit integers in a and b for equality, and store the results
6820// in dst.
6821FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
6822{
6823#if defined(__aarch64__) || defined(_M_ARM64)
6824 return vreinterpretq_m128i_u64(
6825 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
6826#else
6827 // ARMv7 lacks vceqq_u64
6828 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
6829 uint32x4_t cmp =
6830 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
6831 uint32x4_t swapped = vrev64q_u32(cmp);
6832 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
6833#endif
6834}
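
// Worked example (illustrative addition): the ARMv7 path compares 32-bit
// halves, then ANDs the mask with its 64-bit-swapped copy so a 64-bit lane is
// all-ones only when both halves matched.
FORCE_INLINE int _sse2neon_example_cmpeq_epi64(void)
{
    __m128i a = _mm_set_epi64x(2, 1);
    __m128i b = _mm_set_epi64x(3, 1);
    __m128i r = _mm_cmpeq_epi64(a, b); /* {all-ones, all-zeros} */
    return vgetq_lane_s64(vreinterpretq_s64_m128i(r), 0) == -1 &&
           vgetq_lane_s64(vreinterpretq_s64_m128i(r), 1) == 0;
}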
6835
6836// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
6837// the results in dst.
6838// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
6839FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
6840{
6841 return vreinterpretq_m128i_s32(
6842 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
6843}
6844
6845// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
6846// the results in dst.
6847// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
6848FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
6849{
6850 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6851 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6852 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6853 return vreinterpretq_m128i_s64(s64x2);
6854}
6855
6856// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
6857// the results in dst.
6858// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
6859FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
6860{
6861 return vreinterpretq_m128i_s64(
6862 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
6863}
6864
6865// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
6866// the results in dst.
6867// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
6868FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
6869{
6870 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6871 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6872 return vreinterpretq_m128i_s16(s16x8);
6873}
6874
6875// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
6876// the results in dst.
6877// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
6878FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
6879{
6880 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6881 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6882 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
6883 return vreinterpretq_m128i_s32(s32x4);
6884}
6885
6886// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
6887// integers, and store the results in dst.
6888// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
6889FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
6890{
6891 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
6892 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6893 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6894 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6895 return vreinterpretq_m128i_s64(s64x2);
6896}
6897
6898// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
6899// and store the results in dst.
6900// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
6901FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
6902{
6903 return vreinterpretq_m128i_u32(
6904 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
6905}
6906
6907// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
6908// and store the results in dst.
6909// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
6910FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
6911{
6912 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6913 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6914 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6915 return vreinterpretq_m128i_u64(u64x2);
6916}
6917
6918// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
6919// and store the results in dst.
6920// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
6921FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
6922{
6923 return vreinterpretq_m128i_u64(
6924 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
6925}
6926
6927// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
6928// and store the results in dst.
6929// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
6930FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
6931{
6932 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
6933 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
6934 return vreinterpretq_m128i_u16(u16x8);
6935}
6936
6937// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
6938// and store the results in dst.
6939// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
6940FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
6941{
6942 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
6943 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6944 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
6945 return vreinterpretq_m128i_u32(u32x4);
6946}
6947
6948// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed
6949// 64-bit integers, and store the results in dst.
6950// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
6951FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
6952{
6953 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
6954 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6955 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6956 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6957 return vreinterpretq_m128i_u64(u64x2);
6958}
6959
6960// Conditionally multiply the packed double-precision (64-bit) floating-point
6961// elements in a and b using the high 4 bits in imm8, sum the two products, and
6962// conditionally store the sum in dst using the low 4 bits of imm8.
6963// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
6964FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
6965{
6966 // Generate mask value from constant immediate bit value
6967 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
6968 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
6969#if !SSE2NEON_PRECISE_DP
6970 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
6971 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
6972#endif
6973 // Conditional multiplication
6974#if !SSE2NEON_PRECISE_DP
6975 __m128d mul = _mm_mul_pd(a, b);
6976 const __m128d mulMask =
6977 _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
6978 __m128d tmp = _mm_and_pd(mul, mulMask);
6979#else
6980#if defined(__aarch64__) || defined(_M_ARM64)
6981 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
6982 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
6983 : 0;
6984 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
6985 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
6986 : 0;
6987#else
6988 double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
6989 double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
6990#endif
6991 __m128d tmp = _mm_set_pd(d1, d0);
6992#endif
6993 // Sum the products
6994#if defined(__aarch64__) || defined(_M_ARM64)
6995 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
6996#else
6997 double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
6998#endif
6999 // Conditionally store the sum
7000 const __m128d sumMask =
7001 _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
7002 __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
7003 return res;
7004}
7005
7006// Conditionally multiply the packed single-precision (32-bit) floating-point
7007// elements in a and b using the high 4 bits in imm8, sum the four products,
7008// and conditionally store the sum in dst using the low 4 bits of imm.
7009// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
7010FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7011{
7012 float32x4_t elementwise_prod = _mm_mul_ps(a, b);
7013
7014#if defined(__aarch64__) || defined(_M_ARM64)
7015 /* shortcuts */
7016 if (imm == 0xFF) {
7017 return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7018 }
7019
7020 if ((imm & 0x0F) == 0x0F) {
7021 if (!(imm & (1 << 4)))
7022 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
7023 if (!(imm & (1 << 5)))
7024 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
7025 if (!(imm & (1 << 6)))
7026 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
7027 if (!(imm & (1 << 7)))
7028 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
7029
7030 return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7031 }
7032#endif
7033
7034 float s = 0.0f;
7035
7036 if (imm & (1 << 4))
7037 s += vgetq_lane_f32(elementwise_prod, 0);
7038 if (imm & (1 << 5))
7039 s += vgetq_lane_f32(elementwise_prod, 1);
7040 if (imm & (1 << 6))
7041 s += vgetq_lane_f32(elementwise_prod, 2);
7042 if (imm & (1 << 7))
7043 s += vgetq_lane_f32(elementwise_prod, 3);
7044
7045 const float32_t res[4] = {
7046 (imm & 0x1) ? s : 0.0f,
7047 (imm & 0x2) ? s : 0.0f,
7048 (imm & 0x4) ? s : 0.0f,
7049 (imm & 0x8) ? s : 0.0f,
7050 };
7051 return vreinterpretq_m128_f32(vld1q_f32(res));
7052}
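
// Usage sketch (illustrative addition): imm8 = 0x71 multiplies lanes 0..2
// (high nibble 0x7) and stores the sum only in lane 0 (low nibble 0x1), a
// common 3-component dot product.
FORCE_INLINE float _sse2neon_example_dot3(__m128 a, __m128 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
}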
7053
7054// Extract a 32-bit integer from a, selected with imm8, and store the result in
7055// dst.
7056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
7057// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7058#define _mm_extract_epi32(a, imm) \
7059 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7060
7061// Extract a 64-bit integer from a, selected with imm8, and store the result in
7062// dst.
7063// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
7064// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7065#define _mm_extract_epi64(a, imm) \
7066 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7067
7068// Extract an 8-bit integer from a, selected with imm8, and store the result in
7069// the lower element of dst.
7070// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
7071// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
7072#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7073
7074// Extract a single-precision (32-bit) floating-point element from a, selected with imm8, and return its raw bit pattern as an int.
7075// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7076#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
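
// Worked example (illustrative addition): the return value is the raw bit
// pattern of the selected float, not a conversion, so 1.0f extracts as
// 0x3F800000.
FORCE_INLINE int _sse2neon_example_extract_ps(void)
{
    return _mm_extract_ps(_mm_set1_ps(1.0f), 0) == 0x3F800000;
}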
7077
7078// Round the packed double-precision (64-bit) floating-point elements in a down
7079// to an integer value, and store the results as packed double-precision
7080// floating-point elements in dst.
7081// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
7082FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7083{
7084#if defined(__aarch64__) || defined(_M_ARM64)
7085 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7086#else
7087 double *f = (double *) &a;
7088 return _mm_set_pd(floor(f[1]), floor(f[0]));
7089#endif
7090}
7091
7092// Round the packed single-precision (32-bit) floating-point elements in a down
7093// to an integer value, and store the results as packed single-precision
7094// floating-point elements in dst.
7095// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
7096FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7097{
7098#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7099 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7100 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7101#else
7102 float *f = (float *) &a;
7103 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7104#endif
7105}
7106
7107// Round the lower double-precision (64-bit) floating-point element in b down to
7108// an integer value, store the result as a double-precision floating-point
7109// element in the lower element of dst, and copy the upper element from a to the
7110// upper element of dst.
7111// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
7112FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7113{
7114 return _mm_move_sd(a, _mm_floor_pd(b));
7115}
7116
7117// Round the lower single-precision (32-bit) floating-point element in b down to
7118// an integer value, store the result as a single-precision floating-point
7119// element in the lower element of dst, and copy the upper 3 packed elements
7120// from a to the upper elements of dst.
7121// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
7122FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7123{
7124 return _mm_move_ss(a, _mm_floor_ps(b));
7125}
7126
7127// Copy a to dst, and insert the 32-bit integer i into dst at the location
7128// specified by imm8.
7129// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
7130// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7131// __constrange(0,4) int imm)
7132#define _mm_insert_epi32(a, b, imm) \
7133 vreinterpretq_m128i_s32( \
7134 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7135
7136// Copy a to dst, and insert the 64-bit integer i into dst at the location
7137// specified by imm8.
7138// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
7139// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7140// __constrange(0,2) int imm)
7141#define _mm_insert_epi64(a, b, imm) \
7142 vreinterpretq_m128i_s64( \
7143 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7144
7145// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
7146// location specified by imm8.
7147// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
7148// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7149// __constrange(0,16) int imm)
7150#define _mm_insert_epi8(a, b, imm) \
7151 vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7152
7153// Copy a to tmp, then insert a single-precision (32-bit) floating-point
7154// element from b into tmp using the control in imm8. Store tmp to dst using
7155// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7156// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
7157#define _mm_insert_ps(a, b, imm8) \
7158 _sse2neon_define2( \
7159 __m128, a, b, \
7160 float32x4_t tmp1 = \
7161 vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \
7162 vreinterpretq_f32_m128(_a), 0); \
7163 float32x4_t tmp2 = \
7164 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
7165 vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
7166 const uint32_t data[4] = \
7167 _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7168 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7169 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7170 ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
7171 uint32x4_t mask = vld1q_u32(data); \
7172 float32x4_t all_zeros = vdupq_n_f32(0); \
7173 \
7174 _sse2neon_return(vreinterpretq_m128_f32( \
7175 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
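
// Usage sketch (illustrative addition): imm8 bits [7:6] pick the source lane
// of b, bits [5:4] the destination lane in a, and bits [3:0] zero lanes of the
// result; 0x10 copies b's lane 0 into a's lane 1 and zeroes nothing.
FORCE_INLINE __m128 _sse2neon_example_insert_ps(__m128 a, __m128 b)
{
    return _mm_insert_ps(a, b, 0x10);
}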
7176
7177// Compare packed signed 32-bit integers in a and b, and store packed maximum
7178// values in dst.
7179// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
7180FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7181{
7182 return vreinterpretq_m128i_s32(
7183 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7184}
7185
7186// Compare packed signed 8-bit integers in a and b, and store packed maximum
7187// values in dst.
7188// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
7189FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7190{
7191 return vreinterpretq_m128i_s8(
7192 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7193}
7194
7195// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7196// values in dst.
7197// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
7198FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7199{
7200 return vreinterpretq_m128i_u16(
7201 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7202}
7203
7204// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7205// values in dst.
7206// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
7207FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7208{
7209 return vreinterpretq_m128i_u32(
7210 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7211}
7212
7213// Compare packed signed 32-bit integers in a and b, and store packed minimum
7214// values in dst.
7215// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
7216FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7217{
7218 return vreinterpretq_m128i_s32(
7219 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7220}
7221
7222// Compare packed signed 8-bit integers in a and b, and store packed minimum
7223// values in dst.
7224// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
7225FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7226{
7227 return vreinterpretq_m128i_s8(
7228 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7229}
7230
7231// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7232// values in dst.
7233// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
7234FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7235{
7236 return vreinterpretq_m128i_u16(
7237 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7238}
7239
7240// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7241// values in dst.
7242// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
7243FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7244{
7245 return vreinterpretq_m128i_u32(
7246 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7247}
7248
7249// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7250// in a, store the minimum and index in dst, and zero the remaining bits in dst.
7251// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
7252FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7253{
7254 __m128i dst;
7255 uint16_t min, idx = 0;
7256#if defined(__aarch64__) || defined(_M_ARM64)
7257 // Find the minimum value
7258 min = vminvq_u16(vreinterpretq_u16_m128i(a));
7259
7260 // Get the index of the minimum value
7261 static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
7262 uint16x8_t minv = vdupq_n_u16(min);
7263 uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
7264 idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
7265#else
7266 // Find the minimum value
7267 __m64 tmp;
7268 tmp = vreinterpret_m64_u16(
7269 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7270 vget_high_u16(vreinterpretq_u16_m128i(a))));
7271 tmp = vreinterpret_m64_u16(
7272 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7273 tmp = vreinterpret_m64_u16(
7274 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7275 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7276 // Get the index of the minimum value
7277 int i;
7278 for (i = 0; i < 8; i++) {
7279 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7280 idx = (uint16_t) i;
7281 break;
7282 }
7283 a = _mm_srli_si128(a, 2);
7284 }
7285#endif
7286 // Generate result
7287 dst = _mm_setzero_si128();
7288 dst = vreinterpretq_m128i_u16(
7289 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7290 dst = vreinterpretq_m128i_u16(
7291 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7292 return dst;
7293}
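
// Worked example (illustrative addition): for {9, 4, 7, 4, 8, 8, 8, 8} the
// result packs min = 4 into lane 0 and the index of its first occurrence, 1,
// into lane 1; the remaining lanes are zero.
FORCE_INLINE int _sse2neon_example_minpos(void)
{
    __m128i a = _mm_setr_epi16(9, 4, 7, 4, 8, 8, 8, 8);
    __m128i r = _mm_minpos_epu16(a);
    return vgetq_lane_u16(vreinterpretq_u16_m128i(r), 0) == 4 &&
           vgetq_lane_u16(vreinterpretq_u16_m128i(r), 1) == 1;
}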
7294
7295// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
7296// 8-bit integers in a compared to those in b, and store the 16-bit results in
7297// dst. Eight SADs are performed using one quadruplet from b and eight
7298// quadruplets from a. One quadruplet is selected from b starting at on the
7299// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
7300// integers selected from a starting at the offset specified in imm8.
7301// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
7302FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
7303{
7304 uint8x16_t _a, _b;
7305
7306 switch (imm & 0x4) {
7307 case 0:
7308 // do nothing
7309 _a = vreinterpretq_u8_m128i(a);
7310 break;
7311 case 4:
7312 _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
7313 vreinterpretq_u32_m128i(a), 1));
7314 break;
7315 default:
7316#if defined(__GNUC__) || defined(__clang__)
7317 __builtin_unreachable();
7318#elif defined(_MSC_VER)
7319 __assume(0);
7320#endif
7321 break;
7322 }
7323
7324 switch (imm & 0x3) {
7325 case 0:
7326 _b = vreinterpretq_u8_u32(
7327 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
7328 break;
7329 case 1:
7330 _b = vreinterpretq_u8_u32(
7331 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
7332 break;
7333 case 2:
7334 _b = vreinterpretq_u8_u32(
7335 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
7336 break;
7337 case 3:
7338 _b = vreinterpretq_u8_u32(
7339 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
7340 break;
7341 default:
7342#if defined(__GNUC__) || defined(__clang__)
7343 __builtin_unreachable();
7344#elif defined(_MSC_VER)
7345 __assume(0);
7346#endif
7347 break;
7348 }
7349
7350 int16x8_t c04, c15, c26, c37;
7351 uint8x8_t low_b = vget_low_u8(_b);
7352 c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
7353 uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
7354 c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
7355 uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
7356 c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
7357 uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
7358 c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
7359#if defined(__aarch64__) || defined(_M_ARM64)
7360 // |0|4|2|6|
7361 c04 = vpaddq_s16(c04, c26);
7362 // |1|5|3|7|
7363 c15 = vpaddq_s16(c15, c37);
7364
7365 int32x4_t trn1_c =
7366 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7367 int32x4_t trn2_c =
7368 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7369 return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
7370 vreinterpretq_s16_s32(trn2_c)));
7371#else
7372 int16x4_t c01, c23, c45, c67;
7373 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
7374 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
7375 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
7376 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
7377
7378 return vreinterpretq_m128i_s16(
7379 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
7380#endif
7381}
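
// Worked example (illustrative addition): with imm = 0, result lane i is
// |a[i]-b[0]| + |a[i+1]-b[1]| + |a[i+2]-b[2]| + |a[i+3]-b[3]| over unsigned
// bytes, i.e. eight overlapping 4-byte SADs against b's first quadruplet.
FORCE_INLINE int _sse2neon_example_mpsadbw(void)
{
    __m128i a = _mm_set1_epi8(5);
    __m128i b = _mm_set1_epi8(1);
    /* every SAD is |5 - 1| * 4 = 16 */
    __m128i r = _mm_mpsadbw_epu8(a, b, 0);
    return vgetq_lane_u16(vreinterpretq_u16_m128i(r), 0) == 16;
}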
7382
7383// Multiply the low signed 32-bit integers from each packed 64-bit element in
7384// a and b, and store the signed 64-bit results in dst.
7385// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
7386FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7387{
7388 // vmull_s32 widens its inputs rather than masking 64-bit lanes, so narrow each lane to its low 32 bits first.
7389 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7390 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7391 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7392}
7393
7394// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
7395// integers, and store the low 32 bits of the intermediate integers in dst.
7396// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
7397FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7398{
7399 return vreinterpretq_m128i_s32(
7400 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7401}
7402
7403// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
7404// using unsigned saturation, and store the results in dst.
7405// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
7406FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7407{
7408 return vreinterpretq_m128i_u16(
7409 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7410 vqmovun_s32(vreinterpretq_s32_m128i(b))));
7411}
7412
7413// Round the packed double-precision (64-bit) floating-point elements in a using
7414// the rounding parameter, and store the results as packed double-precision
7415// floating-point elements in dst.
7416// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
7417FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
7418{
7419#if defined(__aarch64__) || defined(_M_ARM64)
7420 switch (rounding) {
7421 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7422 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7423 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7424 return _mm_floor_pd(a);
7425 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7426 return _mm_ceil_pd(a);
7427 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7428 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7429 default: //_MM_FROUND_CUR_DIRECTION
7430 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7431 }
7432#else
7433 double *v_double = (double *) &a;
7434
7435 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7436 (rounding == _MM_FROUND_CUR_DIRECTION &&
7437 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7438 double res[2], tmp;
7439 for (int i = 0; i < 2; i++) {
7440 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7441 double roundDown = floor(tmp); // Round down value
7442 double roundUp = ceil(tmp); // Round up value
7443 double diffDown = tmp - roundDown;
7444 double diffUp = roundUp - tmp;
7445 if (diffDown < diffUp) {
7446 /* If it's closer to the round down value, then use it */
7447 res[i] = roundDown;
7448 } else if (diffDown > diffUp) {
7449 /* If it's closer to the round up value, then use it */
7450 res[i] = roundUp;
7451 } else {
7452 /* If it's equidistant between round up and round down value,
7453 * pick the one which is an even number */
7454 double half = roundDown / 2;
7455 if (half != floor(half)) {
7456 /* If the round down value is odd, return the round up value
7457 */
7458 res[i] = roundUp;
7459 } else {
7460 /* If the round down value is even, return it
7461 */
7462 res[i] = roundDown;
7463 }
7464 }
7465 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7466 }
7467 return _mm_set_pd(res[1], res[0]);
7468 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7469 (rounding == _MM_FROUND_CUR_DIRECTION &&
7470 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7471 return _mm_floor_pd(a);
7472 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7473 (rounding == _MM_FROUND_CUR_DIRECTION &&
7474 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7475 return _mm_ceil_pd(a);
7476 }
7477 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7478 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7479#endif
7480}
7481
7482// Round the packed single-precision (32-bit) floating-point elements in a using
7483// the rounding parameter, and store the results as packed single-precision
7484// floating-point elements in dst.
7485// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps
7486FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
7487{
7488#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7489 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7490 switch (rounding) {
7491 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7492 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
7493 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7494 return _mm_floor_ps(a);
7495 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7496 return _mm_ceil_ps(a);
7497 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7498 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
7499 default: //_MM_FROUND_CUR_DIRECTION
7500 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
7501 }
7502#else
7503 float *v_float = (float *) &a;
7504
7505 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7506 (rounding == _MM_FROUND_CUR_DIRECTION &&
7507 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7508 uint32x4_t signmask = vdupq_n_u32(0x80000000);
7509 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
7510 vdupq_n_f32(0.5f)); /* +/- 0.5 */
7511 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7512 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
7513 int32x4_t r_trunc = vcvtq_s32_f32(
7514 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
7515 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7516 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
7517 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7518 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
7519 float32x4_t delta = vsubq_f32(
7520 vreinterpretq_f32_m128(a),
7521 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
7522 uint32x4_t is_delta_half =
7523 vceqq_f32(delta, half); /* delta == +/- 0.5 */
7524 return vreinterpretq_m128_f32(
7525 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7526 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7527 (rounding == _MM_FROUND_CUR_DIRECTION &&
7528 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7529 return _mm_floor_ps(a);
7530 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7531 (rounding == _MM_FROUND_CUR_DIRECTION &&
7532 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7533 return _mm_ceil_ps(a);
7534 }
7535 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7536 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7537 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7538 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7539#endif
7540}
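
// Worked example (illustrative addition): round-to-nearest-even sends ties to
// the even integer, so {0.5, 1.5, 2.5, -2.5} rounds to {0, 2, 2, -2}.
FORCE_INLINE int _sse2neon_example_round_even(void)
{
    __m128 v = _mm_setr_ps(0.5f, 1.5f, 2.5f, -2.5f);
    __m128 r = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    return _mm_cvtss_f32(r) == 0.0f;
}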
7541
7542// Round the lower double-precision (64-bit) floating-point element in b using
7543// the rounding parameter, store the result as a double-precision floating-point
7544// element in the lower element of dst, and copy the upper element from a to the
7545// upper element of dst.
7546// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
7547FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7548{
7549 return _mm_move_sd(a, _mm_round_pd(b, rounding));
7550}
7551
7552// Round the lower single-precision (32-bit) floating-point element in b using
7553// the rounding parameter, store the result as a single-precision floating-point
7554// element in the lower element of dst, and copy the upper 3 packed elements
7555// from a to the upper elements of dst. Rounding is done according to the
7556// rounding[3:0] parameter, which can be one of:
7557// (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
7558//     round to nearest, and suppress exceptions
7559// (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
7560//     round down, and suppress exceptions
7561// (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
7562//     round up, and suppress exceptions
7563// (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
7564//     truncate, and suppress exceptions
7565// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7566// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
7567FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7568{
7569 return _mm_move_ss(a, _mm_round_ps(b, rounding));
7570}
7571
7572// Load 128-bits of integer data from memory into dst using a non-temporal
7573// memory hint. mem_addr must be aligned on a 16-byte boundary or a
7574// general-protection exception may be generated.
7575// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
7576FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7577{
7578#if __has_builtin(__builtin_nontemporal_load)
7579 return __builtin_nontemporal_load(p);
7580#else
7581 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7582#endif
7583}
7584
7585// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
7586// all 1's, and return 1 if the result is zero, otherwise return 0.
7587// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
7588FORCE_INLINE int _mm_test_all_ones(__m128i a)
7589{
7590 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7591 ~(uint64_t) 0;
7592}
7593
7594// Compute the bitwise AND of 128 bits (representing integer data) in a and
7595// mask, and return 1 if the result is zero, otherwise return 0.
7596// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
7597FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7598{
7599 int64x2_t a_and_mask =
7600 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7601 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7602}
7603
7604// Compute the bitwise AND of 128 bits (representing integer data) in a and
7605// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
7606// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
7607// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7608// otherwise return 0.
7609// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
7610// Note: Argument names may be wrong in the Intel intrinsics guide.
7611FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7612{
7613 uint64x2_t v = vreinterpretq_u64_m128i(a);
7614 uint64x2_t m = vreinterpretq_u64_m128i(mask);
7615
7616 // find ones (set-bits) and zeros (clear-bits) under clip mask
7617 uint64x2_t ones = vandq_u64(m, v);
7618 uint64x2_t zeros = vbicq_u64(m, v);
7619
7620 // If both 128-bit variables are populated (non-zero) then return 1.
7621 // For comparison purposes, first compact each variable down to 32 bits.
7622 uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
7623
7624 // if the folded minimum is non-zero, both variables must be non-zero
7625 return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
7626}
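
// Worked example (illustrative addition): against an all-ones mask, a value
// holding both set and clear bits yields 1, while all-zeros or all-ones
// yields 0 (one of ZF/CF is set in those cases).
FORCE_INLINE int _sse2neon_example_mix(void)
{
    __m128i ones = _mm_set1_epi32(-1);
    return _mm_test_mix_ones_zeros(_mm_set_epi64x(0, 1), ones) == 1 &&
           _mm_test_mix_ones_zeros(ones, ones) == 0;
}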
7627
7628// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7629// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7630// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7631// otherwise set CF to 0. Return the CF value.
7632// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
7633FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7634{
7635 int64x2_t s64 =
7636 vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
7637 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7638}
7639
7640// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7641// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7642// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7643// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7644// otherwise return 0.
7645// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
7646#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7647
7648// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7649// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7650// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7651// otherwise set CF to 0. Return the ZF value.
7652// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
7653FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
7654{
7655 int64x2_t s64 =
7656 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
7657 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7658}
7659
7660/* SSE4.2 */
7661
7662static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
7663 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7664};
7665static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
7666 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7667 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7668};
7669
7670/* specify the source data format */
7671#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
7672#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
7673#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
7674#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
7675
7676/* specify the comparison operation */
7677#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */
7678#define _SIDD_CMP_RANGES 0x04 /* compare ranges */
7679#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */
7680#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
7681
7682/* specify the polarity */
7683#define _SIDD_POSITIVE_POLARITY 0x00
7684#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
7685#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
7686#define _SIDD_MASKED_NEGATIVE_POLARITY \
7687 0x30 /* negate results only before end of string */
7688
7689/* specify the output selection in _mm_cmpXstri */
7690#define _SIDD_LEAST_SIGNIFICANT 0x00
7691#define _SIDD_MOST_SIGNIFICANT 0x40
7692
7693/* specify the output selection in _mm_cmpXstrm */
7694#define _SIDD_BIT_MASK 0x00
7695#define _SIDD_UNIT_MASK 0x40
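// Usage sketch (illustrative addition): the _SIDD_* flags are OR'ed together
// into the imm8 control of the _mm_cmp[ei]str* family defined further below.
// For example, a strchr-style scan for any byte of `set` inside `chunk`
// (hypothetical __m128i variables) could be written as:
//
//   const int ctrl = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
//                    _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT;
//   int idx = _mm_cmpistri(set, chunk, ctrl); // 16 means "no match"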
7696
7697/* Pattern Matching for C macros.
7698 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
7699 */
7700
7701/* catenate */
7702#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
7703#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
7704
7705#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
7706/* run the 2nd parameter */
7707#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
7708/* run the 1st parameter */
7709#define SSE2NEON_IIF_1(t, ...) t
7710
7711#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
7712#define SSE2NEON_COMPL_0 1
7713#define SSE2NEON_COMPL_1 0
7714
7715#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
7716#define SSE2NEON_DEC_1 0
7717#define SSE2NEON_DEC_2 1
7718#define SSE2NEON_DEC_3 2
7719#define SSE2NEON_DEC_4 3
7720#define SSE2NEON_DEC_5 4
7721#define SSE2NEON_DEC_6 5
7722#define SSE2NEON_DEC_7 6
7723#define SSE2NEON_DEC_8 7
7724#define SSE2NEON_DEC_9 8
7725#define SSE2NEON_DEC_10 9
7726#define SSE2NEON_DEC_11 10
7727#define SSE2NEON_DEC_12 11
7728#define SSE2NEON_DEC_13 12
7729#define SSE2NEON_DEC_14 13
7730#define SSE2NEON_DEC_15 14
7731#define SSE2NEON_DEC_16 15
7732
7733/* detection */
7734#define SSE2NEON_CHECK_N(x, n, ...) n
7735#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
7736#define SSE2NEON_PROBE(x) x, 1,
7737
7738#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
7739#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
7740
7741#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
7742#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
7743
7744#define SSE2NEON_EAT(...)
7745#define SSE2NEON_EXPAND(...) __VA_ARGS__
7746#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
7747
7748/* recursion */
7749/* deferred expression */
7750#define SSE2NEON_EMPTY()
7751#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
7752#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
7753#define SSE2NEON_EXPAND(...) __VA_ARGS__
7754
7755#define SSE2NEON_EVAL(...) \
7756 SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
7757#define SSE2NEON_EVAL1(...) \
7758 SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
7759#define SSE2NEON_EVAL2(...) \
7760 SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
7761#define SSE2NEON_EVAL3(...) __VA_ARGS__
7762
7763#define SSE2NEON_REPEAT(count, macro, ...) \
7764 SSE2NEON_WHEN(count) \
7765 (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
7766 SSE2NEON_DEC(count), macro, \
7767 __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
7768 __VA_ARGS__))
7769#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
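// Expansion sketch (illustrative): wrapped in SSE2NEON_EVAL, the recursive
// SSE2NEON_REPEAT macro applies `macro` once per index from 0 to count - 1,
// so for a hypothetical EMIT macro,
//
//   SSE2NEON_EVAL(SSE2NEON_REPEAT(3, EMIT, u8))
//
// ultimately expands to:
//
//   EMIT(0, u8) EMIT(1, u8) EMIT(2, u8)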
7770
7771#define SSE2NEON_SIZE_OF_byte 8
7772#define SSE2NEON_NUMBER_OF_LANES_byte 16
7773#define SSE2NEON_SIZE_OF_word 16
7774#define SSE2NEON_NUMBER_OF_LANES_word 8
7775
7776#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
7777 mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
7778 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
7779 vreinterpretq_##type##_m128i(a)));
7780
7781#define SSE2NEON_FILL_LANE(i, type) \
7782 vec_b[i] = \
7783 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
7784
7785#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
7786 number_of_lanes, byte_or_word) \
7787 do { \
7788 SSE2NEON_CAT( \
7789 data_type_prefix, \
7790 SSE2NEON_CAT(size, \
7791 SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
7792 vec_b[number_of_lanes]; \
7793 __m128i mask = SSE2NEON_IIF(byte_or_word)( \
7794 vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
7795 vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
7796 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
7797 SSE2NEON_CAT(type_prefix, size))) \
7798 for (int i = 0; i < number_of_lanes; i++) { \
7799 mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
7800 size)(SSE2NEON_CAT(vbslq_u, size)( \
7801 SSE2NEON_CAT(vreinterpretq_u, \
7802 SSE2NEON_CAT(size, _m128i))(mask), \
7803 SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
7804 vec_b[i], \
7805 SSE2NEON_CAT( \
7806 vreinterpretq_, \
7807 SSE2NEON_CAT(type_prefix, \
7808 SSE2NEON_CAT(size, _m128i(a))))), \
7809 SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
7810 vec_b[i], \
7811 SSE2NEON_CAT( \
7812 vreinterpretq_, \
7813 SSE2NEON_CAT(type_prefix, \
7814 SSE2NEON_CAT(size, _m128i(a))))))); \
7815 } \
7816 } while (0)
7817
7818#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
7819 do { \
7820 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
7821 SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
7822 SSE2NEON_CAT(u, size))) \
7823 } while (0)
7824
7825#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
7826 static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
7827 int lb) \
7828 { \
7829 __m128i mtx[16]; \
7830 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7831 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7832 return SSE2NEON_CAT( \
7833 _sse2neon_aggregate_equal_any_, \
7834 SSE2NEON_CAT( \
7835 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7836 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7837 type))))(la, lb, mtx); \
7838 }
7839
7840#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
7841 static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
7842 int lb) \
7843 { \
7844 __m128i mtx[16]; \
7845 PCMPSTR_RANGES( \
7846 a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7847 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
7848 return SSE2NEON_CAT( \
7849 _sse2neon_aggregate_ranges_, \
7850 SSE2NEON_CAT( \
7851 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7852 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7853 type))))(la, lb, mtx); \
7854 }
7855
7856#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
7857 static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
7858 __m128i b, int lb) \
7859 { \
7860 __m128i mtx[16]; \
7861 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7862 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7863 return SSE2NEON_CAT( \
7864 _sse2neon_aggregate_equal_ordered_, \
7865 SSE2NEON_CAT( \
7866 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7867 SSE2NEON_CAT(x, \
7868 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
7869 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
7870 }
7871
7872static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
7873{
7874 int res = 0;
7875 int m = (1 << la) - 1;
7876 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7877 uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
7878 uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
7879 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
7880 for (int j = 0; j < lb; j++) {
7881 mtx[j] = vreinterpretq_m128i_u8(
7882 vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
7883 mtx[j] = vreinterpretq_m128i_u8(
7884 vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
7885 int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
7886 res |= (tmp << j);
7887 }
7888 return res;
7889}
7890
7891static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
7892{
7893 int res = 0;
7894 int m = (1 << la) - 1;
7895 uint16x8_t vec =
7896 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
7897 for (int j = 0; j < lb; j++) {
7898 mtx[j] = vreinterpretq_m128i_u16(
7899 vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
7900 mtx[j] = vreinterpretq_m128i_u16(
7901 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
7902 int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
7903 res |= (tmp << j);
7904 }
7905 return res;
7906}
7907
7908/* clang-format off */
7909#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
7910 prefix##IMPL(byte) \
7911 prefix##IMPL(word)
7912/* clang-format on */
7913
7914SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
7915
7916static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
7917{
7918 int res = 0;
7919 int m = (1 << la) - 1;
7920 uint16x8_t vec =
7921 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
7922 for (int j = 0; j < lb; j++) {
7923 mtx[j] = vreinterpretq_m128i_u16(
7924 vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
7925 mtx[j] = vreinterpretq_m128i_u16(
7926 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
7927 __m128i tmp = vreinterpretq_m128i_u32(
7928 vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
7929 uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
7930 vreinterpretq_u32_m128i(tmp));
7931#if defined(__aarch64__) || defined(_M_ARM64)
7932 int t = vaddvq_u32(vec_res) ? 1 : 0;
7933#else
7934 uint64x2_t sumh = vpaddlq_u32(vec_res);
7935 int t = (vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1)) ? 1 : 0; // normalize so only bit j is set
7936#endif
7937 res |= (t << j);
7938 }
7939 return res;
7940}
7941
7942static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
7943{
7944 int res = 0;
7945 int m = (1 << la) - 1;
7946 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7947 uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
7948 uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
7949 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
7950 for (int j = 0; j < lb; j++) {
7951 mtx[j] = vreinterpretq_m128i_u8(
7952 vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
7953 mtx[j] = vreinterpretq_m128i_u8(
7954 vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
7955 __m128i tmp = vreinterpretq_m128i_u16(
7956 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
7957 uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
7958 vreinterpretq_u16_m128i(tmp));
7959 int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
7960 res |= (t << j);
7961 }
7962 return res;
7963}
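// Note on the RANGES aggregation above (editorial): operand a supplies
// (low, high) pairs in consecutive lanes. PCMPSTR_RANGES compared the even
// lanes with vcge (b[j] >= low) and the odd lanes with vcle (b[j] <= high),
// so ANDing each lane with its shifted neighbour tests
// low <= b[j] && b[j] <= high for every pair.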
7964
7965#define SSE2NEON_CMP_RANGES_IS_BYTE 1
7966#define SSE2NEON_CMP_RANGES_IS_WORD 0
7967
7968/* clang-format off */
7969#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
7970 prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
7971 prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
7972 prefix##IMPL(word, uint, u, prefix##IS_WORD) \
7973 prefix##IMPL(word, int, s, prefix##IS_WORD)
7974/* clang-format on */
7975
7976SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
7977
7978#undef SSE2NEON_CMP_RANGES_IS_BYTE
7979#undef SSE2NEON_CMP_RANGES_IS_WORD
7980
7981static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
7982{
7983 uint8x16_t mtx =
7984 vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
7985 int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
7986 int m1 = 0x10000 - (1 << la);
7987 int tb = 0x10000 - (1 << lb);
7988 uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
7989 uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
7990 vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7991 vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
7992 vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
7993 vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
7994 vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
7995 tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
7996 tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
7997
7998 res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
7999 res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
8000 res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
8001 res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
8002 res_lo = vand_u8(res_lo, vec_mask);
8003 res_hi = vand_u8(res_hi, vec_mask);
8004
8005 int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
8006 return res;
8007}
8008
8009static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
8010{
8011 uint16x8_t mtx =
8012 vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
8013 int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
8014 int m1 = 0x100 - (1 << la);
8015 int tb = 0x100 - (1 << lb);
8016 uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
8017 uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
8018 uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
8019 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
8020 mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
8021 mtx = vbslq_u16(vec1, tmp, mtx);
8022 mtx = vandq_u16(mtx, vec_mask);
8023 return _sse2neon_vaddvq_u16(mtx);
8024}
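// Note on the two equal-each helpers above (editorial): the scalar masks
// m0, m1 and tb encode the PCMPESTR out-of-range rule. A lane past the end
// of both strings is forced to match, a lane past the end of exactly one
// string is forced to mismatch, and only lanes inside both strings keep
// their vceqq comparison result.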
8025
8026#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
8027#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
8028
8029#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
8030 static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
8031 int bound, int la, int lb, __m128i mtx[16]) \
8032 { \
8033 int res = 0; \
8034 int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \
8035 uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
8036 vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
8037 vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
8038 uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
8039 vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \
8040 vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
8041 vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \
8042 uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
8043 uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
8044 for (int j = 0; j < lb; j++) { \
8045 mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
8046 vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
8047 } \
8048 for (int j = lb; j < bound; j++) { \
8049 mtx[j] = vreinterpretq_m128i_u##size( \
8050 vbslq_u##size(vec1, vec_minusone, vec_zero)); \
8051 } \
8052 unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
8053 (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
8054 for (int i = 0; i < bound; i++) { \
8055 int val = 1; \
8056 for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
8057 val &= ptr[k * bound + j]; \
8058 res += val << i; \
8059 } \
8060 return res; \
8061 }
8062
8063/* clang-format off */
8064#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
8065 prefix##IMPL(8, 16, prefix##IS_UBYTE) \
8066 prefix##IMPL(16, 8, prefix##IS_UWORD)
8067/* clang-format on */
8068
8069SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
8070
8071#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
8072#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
8073
8074/* clang-format off */
8075#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
8076 prefix##IMPL(byte) \
8077 prefix##IMPL(word)
8078/* clang-format on */
8079
8080SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
8081
8082#define SSE2NEON_CMPESTR_LIST \
8083 _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8084 _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
8085 _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8086 _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
8087 _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
8088 _(CMP_UWORD_RANGES, cmp_uword_ranges) \
8089 _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
8090 _(CMP_SWORD_RANGES, cmp_sword_ranges) \
8091 _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8092 _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
8093 _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8094 _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
8095 _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8096 _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
8097 _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8098 _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
8099
8100enum {
8101#define _(name, func_suffix) name,
8102 SSE2NEON_CMPESTR_LIST
8103#undef _
8104};
8105typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
8106static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
8107#define _(name, func_suffix) _sse2neon_##func_suffix,
8108 SSE2NEON_CMPESTR_LIST
8109#undef _
8110};
8111
8112FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
8113{
8114 switch (imm8 & 0x30) {
8115 case _SIDD_NEGATIVE_POLARITY:
8116 res ^= 0xffffffff;
8117 break;
8118 case _SIDD_MASKED_NEGATIVE_POLARITY:
8119 res ^= (1 << lb) - 1;
8120 break;
8121 default:
8122 break;
8123 }
8124
8125 return res & ((bound == 8) ? 0xFF : 0xFFFF);
8126}
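// Example (illustrative): with res = 0b00000100, lb = 5 and byte data
// (bound = 8):
//
//   _SIDD_MASKED_NEGATIVE_POLARITY negates only the bits covered by
//   string b:    res ^ ((1 << 5) - 1)        == 0b00011011
//   _SIDD_NEGATIVE_POLARITY negates everything; the final bound mask
//   then keeps:  (res ^ 0xffffffff) & 0xFF   == 0b11111011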
8127
8128FORCE_INLINE int _sse2neon_clz(unsigned int x)
8129{
8130#ifdef _MSC_VER
8131 unsigned long cnt = 0;
8132 if (_BitScanReverse(&cnt, x))
8133 return 31 - cnt;
8134 return 32;
8135#else
8136 return x != 0 ? __builtin_clz(x) : 32;
8137#endif
8138}
8139
8140FORCE_INLINE int _sse2neon_ctz(unsigned int x)
8141{
8142#ifdef _MSC_VER
8143 unsigned long cnt = 0;
8144 if (_BitScanForward(&cnt, x))
8145 return cnt;
8146 return 32;
8147#else
8148 return x != 0 ? __builtin_ctz(x) : 32;
8149#endif
8150}
8151
8152FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
8153{
8154#ifdef _MSC_VER
8155 unsigned long cnt;
8156#if defined(SSE2NEON_HAS_BITSCAN64)
8157 if (_BitScanForward64(&cnt, x))
8158 return (int) (cnt);
8159#else
8160 if (_BitScanForward(&cnt, (unsigned long) (x)))
8161 return (int) cnt;
8162 if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
8163 return (int) (cnt + 32);
8164#endif /* SSE2NEON_HAS_BITSCAN64 */
8165 return 64;
8166#else /* assume GNU compatible compilers */
8167 return x != 0 ? __builtin_ctzll(x) : 64;
8168#endif
8169}
8170
8171#define SSE2NEON_MIN(x, y) ((x) < (y) ? (x) : (y))
8172
8173#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
8174 const int var = (imm & 0x01) ? 8 : 16
8175
8176#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
8177 int tmp1 = la ^ (la >> 31); \
8178 la = tmp1 - (la >> 31); \
8179 int tmp2 = lb ^ (lb >> 31); \
8180 lb = tmp2 - (lb >> 31); \
8181 la = SSE2NEON_MIN(la, bound); \
8182 lb = SSE2NEON_MIN(lb, bound)
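// Editorial note: the xor/subtract dance above is a branchless absolute
// value. For a 32-bit int, `(x ^ (x >> 31)) - (x >> 31)` yields |x|, since
// x >> 31 is 0 for non-negative x and -1 (all ones) for negative x:
//
//   int abs32(int x) { int s = x >> 31; return (x ^ s) - s; }
//
// PCMPESTR* interprets negative explicit lengths by their absolute value
// before clamping to the 8- or 16-character bound.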
8183
8184// Compare all pairs of characters in strings a and b,
8185// then aggregate the result.
8186// As the only difference between PCMPESTR* and PCMPISTR* is the way the
8187// string lengths are determined, we use SSE2NEON_CMP{E,I}STRX_LEN_PAIR to
8188// obtain the lengths of strings a and b.
8189#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
8190 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
8191 SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
8192 int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
8193 r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
8194
8195#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
8196 return (r2 == 0) ? bound \
8197 : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
8198 : _sse2neon_ctz(r2))
8199
8200#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
8201 __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
8202 if (imm8 & 0x40) { \
8203 if (bound == 8) { \
8204 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
8205 vld1q_u16(_sse2neon_cmpestr_mask16b)); \
8206 dst = vreinterpretq_m128i_u16(vbslq_u16( \
8207 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
8208 } else { \
8209 uint8x16_t vec_r2 = \
8210 vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \
8211 uint8x16_t tmp = \
8212 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
8213 dst = vreinterpretq_m128i_u8( \
8214 vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
8215 } \
8216 } else { \
8217 if (bound == 16) { \
8218 dst = vreinterpretq_m128i_u16( \
8219 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
8220 } else { \
8221 dst = vreinterpretq_m128i_u8( \
8222 vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \
8223 } \
8224 } \
8225 return dst
8226
8227// Compare packed strings in a and b with lengths la and lb using the control
8228// in imm8, and returns 1 if b did not contain a null character and the
8229// resulting mask was zero, and 0 otherwise.
8230// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
8231FORCE_INLINE int _mm_cmpestra(__m128i a,
8232 int la,
8233 __m128i b,
8234 int lb,
8235 const int imm8)
8236{
8237 int lb_cpy = lb;
8238 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8239 return !r2 & (lb_cpy > bound);
8240}
8241
8242// Compare packed strings in a and b with lengths la and lb using the control in
8243// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
8244// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
8245FORCE_INLINE int _mm_cmpestrc(__m128i a,
8246 int la,
8247 __m128i b,
8248 int lb,
8249 const int imm8)
8250{
8251 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8252 return r2 != 0;
8253}
8254
8255// Compare packed strings in a and b with lengths la and lb using the control
8256// in imm8, and store the generated index in dst.
8257// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
8258FORCE_INLINE int _mm_cmpestri(__m128i a,
8259 int la,
8260 __m128i b,
8261 int lb,
8262 const int imm8)
8263{
8264 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8265 SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
8266}
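// Usage sketch (illustrative addition): find the first of the 5 vowels
// within the first 10 bytes of a 16-byte readable buffer `buf`:
//
//   __m128i vowels = _mm_loadu_si128((const __m128i *) "aeiou\0\0\0\0\0\0\0\0\0\0\0");
//   __m128i hay = _mm_loadu_si128((const __m128i *) buf);
//   int idx = _mm_cmpestri(vowels, 5, hay, 10,
//                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
//   // idx == 16 (the byte bound) means no vowel was found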
8267
8268// Compare packed strings in a and b with lengths la and lb using the control
8269// in imm8, and store the generated mask in dst.
8270// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
8271FORCE_INLINE __m128i
8272_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
8273{
8274 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8275 SSE2NEON_CMPSTR_GENERATE_MASK(dst);
8276}
8277
8278// Compare packed strings in a and b with lengths la and lb using the control in
8279// imm8, and returns bit 0 of the resulting bit mask.
8280// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
8281FORCE_INLINE int _mm_cmpestro(__m128i a,
8282 int la,
8283 __m128i b,
8284 int lb,
8285 const int imm8)
8286{
8287 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8288 return r2 & 1;
8289}
8290
8291// Compare packed strings in a and b with lengths la and lb using the control in
8292// imm8, and returns 1 if any character in a was null, and 0 otherwise.
8293// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
8294FORCE_INLINE int _mm_cmpestrs(__m128i a,
8295 int la,
8296 __m128i b,
8297 int lb,
8298 const int imm8)
8299{
8300 (void) a;
8301 (void) b;
8302 (void) lb;
8303 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8304 return la <= (bound - 1);
8305}
8306
8307// Compare packed strings in a and b with lengths la and lb using the control in
8308// imm8, and returns 1 if any character in b was null, and 0 otherwise.
8309// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
8310FORCE_INLINE int _mm_cmpestrz(__m128i a,
8311 int la,
8312 __m128i b,
8313 int lb,
8314 const int imm8)
8315{
8316 (void) a;
8317 (void) b;
8318 (void) la;
8319 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8320 return lb <= (bound - 1);
8321}
8322
8323#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
8324 do { \
8325 if (imm8 & 0x01) { \
8326 uint16x8_t equal_mask_##str = \
8327 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
8328 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8329 uint64_t matches_##str = \
8330 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8331 len = _sse2neon_ctzll(matches_##str) >> 3; \
8332 } else { \
8333 uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
8334 vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
8335 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8336 uint64_t matches_##str = \
8337 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8338 len = _sse2neon_ctzll(matches_##str) >> 2; \
8339 } \
8340 } while (0)
8341
8342#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
8343 int la, lb; \
8344 do { \
8345 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
8346 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
8347 } while (0)
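// Editorial note: the implicit-length scan above uses the NEON narrowing-
// shift trick. vceqq yields an all-ones lane per matching character,
// vshrn_n_u16(..., 4) compresses the 128-bit lane mask into 64 bits (4 bits
// per byte, 8 bits per word), and the count of trailing zero bits, divided
// by 4 (bytes) or 8 (words), is the index of the first null character.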
8348
8349// Compare packed strings with implicit lengths in a and b using the control in
8350// imm8, and returns 1 if b did not contain a null character and the resulting
8351// mask was zero, and 0 otherwise.
8352// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
8353FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
8354{
8355 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8356 return !r2 & (lb >= bound);
8357}
8358
8359// Compare packed strings with implicit lengths in a and b using the control in
8360// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
8361// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
8362FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
8363{
8364 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8365 return r2 != 0;
8366}
8367
8368// Compare packed strings with implicit lengths in a and b using the control in
8369// imm8, and store the generated index in dst.
8370// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
8371FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
8372{
8373 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8374 SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
8375}
8376
8377// Compare packed strings with implicit lengths in a and b using the control in
8378// imm8, and store the generated mask in dst.
8379// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
8380FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
8381{
8382 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8383 SSE2NEON_CMPSTR_GENERATE_MASK(dst);
8384}
8385
8386// Compare packed strings with implicit lengths in a and b using the control in
8387// imm8, and returns bit 0 of the resulting bit mask.
8388// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
8389FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
8390{
8391 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8392 return r2 & 1;
8393}
8394
8395// Compare packed strings with implicit lengths in a and b using the control in
8396// imm8, and returns 1 if any character in a was null, and 0 otherwise.
8397// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
8398FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
8399{
8400 (void) b;
8401 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8402 int la;
8403 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
8404 return la <= (bound - 1);
8405}
8406
8407// Compare packed strings with implicit lengths in a and b using the control in
8408// imm8, and returns 1 if any character in b was null, and 0 otherwise.
8409// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
8410FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
8411{
8412 (void) a;
8413 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8414 int lb;
8415 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
8416 return lb <= (bound - 1);
8417}
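// Usage sketch (illustrative addition): a strcmp-style loop built from the
// implicit-length variants, assuming both pointers stay readable in 16-byte
// chunks:
//
//   const int ctrl = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH |
//                    _SIDD_NEGATIVE_POLARITY;
//   for (;;) {
//       __m128i va = _mm_loadu_si128((const __m128i *) s1);
//       __m128i vb = _mm_loadu_si128((const __m128i *) s2);
//       if (_mm_cmpistrc(va, vb, ctrl)) break; // a difference was found
//       if (_mm_cmpistrz(va, vb, ctrl)) break; // b ended: prefixes equal
//       s1 += 16; s2 += 16;
//   }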
8418
8419// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
8420// in b for greater than.
8421FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
8422{
8423#if defined(__aarch64__) || defined(_M_ARM64)
8424 return vreinterpretq_m128i_u64(
8425 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
8426#else
8427 return vreinterpretq_m128i_s64(vshrq_n_s64(
8428 vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
8429 63));
8430#endif
8431}
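// Editorial note: the ARMv7-A fallback above relies on the sign of the
// saturating difference. vqsubq_s64(b, a) is negative exactly when a > b
// (saturation avoids the wrap-around a plain subtraction could suffer), and
// the arithmetic shift by 63 smears the sign bit across the lane, producing
// the all-ones/all-zeros comparison mask.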
8432
8433// Starting with the initial value in crc, accumulates a CRC32 value for
8434// unsigned 16-bit integer v, and stores the result in dst.
8435// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
8436FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
8437{
8438#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8439 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8440 : [c] "+r"(crc)
8441 : [v] "r"(v));
8442#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8443 (defined(_M_ARM64) && !defined(__clang__))
8444 crc = __crc32ch(crc, v);
8445#else
8446 crc = _mm_crc32_u8(crc, v & 0xff);
8447 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
8448#endif
8449 return crc;
8450}
8451
8452// Starting with the initial value in crc, accumulates a CRC32 value for
8453// unsigned 32-bit integer v, and stores the result in dst.
8454// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
8455FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
8456{
8457#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8458 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8459 : [c] "+r"(crc)
8460 : [v] "r"(v));
8461#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8462 (defined(_M_ARM64) && !defined(__clang__))
8463 crc = __crc32cw(crc, v);
8464#else
8465 crc = _mm_crc32_u16(crc, v & 0xffff);
8466 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
8467#endif
8468 return crc;
8469}
8470
8471// Starting with the initial value in crc, accumulates a CRC32 value for
8472// unsigned 64-bit integer v, and stores the result in dst.
8473// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
8474FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
8475{
8476#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8477 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8478 : [c] "+r"(crc)
8479 : [v] "r"(v));
8480#elif (defined(_M_ARM64) && !defined(__clang__))
8481 crc = __crc32cd((uint32_t) crc, v);
8482#else
8483 crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
8484 crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
8485#endif
8486 return crc;
8487}
8488
8489// Starting with the initial value in crc, accumulates a CRC32 value for
8490// unsigned 8-bit integer v, and stores the result in dst.
8491// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
8492FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
8493{
8494#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8495 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8496 : [c] "+r"(crc)
8497 : [v] "r"(v));
8498#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8499 (defined(_M_ARM64) && !defined(__clang__))
8500 crc = __crc32cb(crc, v);
8501#else
8502 crc ^= v;
8503#if defined(__ARM_FEATURE_CRYPTO)
8504 // Adapted from: https://mary.rs/lab/crc32/
8505 // Barrett reduction
8506 uint64x2_t orig =
8507 vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
8508 uint64x2_t tmp = orig;
8509
8510 // Polynomial P(x) of CRC32C
8511 uint64_t p = 0x105EC76F1;
8512 // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
8513 // 2^{64} / P(x) \rfloor = 0x11f91caf6
8514 uint64_t mu = 0x1dea713f1;
8515
8516 // Multiply by mu_{64}
8517 tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
8518 // Divide by 2^{64} (mask away the unnecessary bits)
8519 tmp =
8520 vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
8521 // Multiply by P(x) (shifted left by 1 for alignment reasons)
8522 tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
8523 // Subtract original from result
8524 tmp = veorq_u64(tmp, orig);
8525
8526 // Extract the 'lower' (in bit-reflected sense) 32 bits
8527 crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
8528#else // Fall back to the generic table lookup approach
8529 // Adapted from: https://create.stephan-brumme.com/crc32/
8530 // Apply the half-byte lookup algorithm for a good trade-off between
8531 // performance and lookup-table size.
8532
8533 // The lookup table just needs to store every 16th entry
8534 // of the standard look-up table.
8535 static const uint32_t crc32_half_byte_tbl[] = {
8536 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
8537 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
8538 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
8539 };
8540
8541 crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
8542 crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
8543#endif
8544#endif
8545 return crc;
8546}
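// Usage sketch (illustrative addition): CRC32-C of a byte buffer with the
// customary initial value and final inversion (hypothetical helper, not part
// of this header):
//
//   uint32_t crc32c(const uint8_t *p, size_t n)
//   {
//       uint32_t crc = 0xFFFFFFFF;
//       while (n--)
//           crc = _mm_crc32_u8(crc, *p++);
//       return ~crc;
//   }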
8547
8548/* AES */
8549
8550#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
8551/* clang-format off */
8552#define SSE2NEON_AES_SBOX(w) \
8553 { \
8554 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8555 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8556 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8557 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8558 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8559 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8560 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8561 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8562 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8563 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8564 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8565 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8566 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8567 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8568 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8569 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8570 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8571 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8572 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8573 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8574 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8575 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8576 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8577 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8578 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8579 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8580 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8581 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8582 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8583 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8584 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8585 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8586 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8587 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8588 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8589 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8590 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8591 }
8592#define SSE2NEON_AES_RSBOX(w) \
8593 { \
8594 w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
8595 w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
8596 w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
8597 w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
8598 w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
8599 w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
8600 w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
8601 w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
8602 w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
8603 w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
8604 w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
8605 w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
8606 w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
8607 w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
8608 w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
8609 w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
8610 w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
8611 w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
8612 w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
8613 w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
8614 w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
8615 w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
8616 w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
8617 w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
8618 w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
8619 w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
8620 w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
8621 w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
8622 w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
8623 w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
8624 w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
8625 w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
8626 w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
8627 w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
8628 w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
8629 w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
8630 w(0x55), w(0x21), w(0x0c), w(0x7d) \
8631 }
8632/* clang-format on */
8633
8634/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
8635#define SSE2NEON_AES_H0(x) (x)
8636static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
8637static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
8638#undef SSE2NEON_AES_H0
8639
8640/* x_time function and matrix multiply function */
8641#if !defined(__aarch64__) && !defined(_M_ARM64)
8642#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
8643#define SSE2NEON_MULTIPLY(x, y) \
8644 (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
8645 ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
8646 ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
8647 ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
8648#endif
8649
8650// In the absence of crypto extensions, implement aesenc using regular NEON
8651// intrinsics instead. See:
8652// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
8653// and https://www.workofard.com/2017/07/ghash-for-low-end-cores/
8654// for more information.
8655FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
8656{
8657#if defined(__aarch64__) || defined(_M_ARM64)
8658 static const uint8_t shift_rows[] = {
8659 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8660 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8661 };
8662 static const uint8_t ror32by8[] = {
8663 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8664 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8665 };
8666
8667 uint8x16_t v;
8668 uint8x16_t w = vreinterpretq_u8_m128i(a);
8669
8670 /* shift rows */
8671 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8672
8673 /* sub bytes */
8674 // Here, we split the whole 256-byte table into four 64-byte tables and
8675 // look each of them up in turn. Every subsequent lookup uses the table
8676 // located 64 bytes further on, so the indices handed to `vqtbx4q_u8()`
8677 // must be shifted down by the same offset as the table that was loaded
8678 // (e.g. `w - 0x40` for the table at offset 0x40).
8679 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
8680 // 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
8681 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
8682 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
8683 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
8684
8685 /* mix columns */
8686 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8687 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8688 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8689
8690 /* add round key */
8691 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8692
8693#else /* ARMv7-A implementation for a table-based AES */
8694#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8695 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8696 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8697// multiplying 'x' by 2 in GF(2^8)
8698#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
8699// multiplying 'x' by 3 in GF(2^8)
8700#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8701#define SSE2NEON_AES_U0(p) \
8702 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8703#define SSE2NEON_AES_U1(p) \
8704 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8705#define SSE2NEON_AES_U2(p) \
8706 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8707#define SSE2NEON_AES_U3(p) \
8708 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8709
8710 // these tables combine sub_bytes() with mix_columns() for every possible
8711 // byte value; shift_rows() is realized by the byte selection below.
8712 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8713 SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
8714 SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
8715 SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
8716 SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
8717 };
8718#undef SSE2NEON_AES_B2W
8719#undef SSE2NEON_AES_F2
8720#undef SSE2NEON_AES_F3
8721#undef SSE2NEON_AES_U0
8722#undef SSE2NEON_AES_U1
8723#undef SSE2NEON_AES_U2
8724#undef SSE2NEON_AES_U3
8725
8726 uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0]
8727 uint32_t x1 =
8728 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32]
8729 uint32_t x2 =
8730 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64]
8731 uint32_t x3 =
8732 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96]
8733
8734 // finish the modulo addition step in mix_columns()
8735 __m128i out = _mm_set_epi32(
8736 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8737 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8738 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8739 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8740 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8741 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8742 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8743 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8744
8745 return _mm_xor_si128(out, RoundKey);
8746#endif
8747}
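// Usage sketch (illustrative addition): with an expanded AES-128 key
// schedule rk[0..10], one block is encrypted exactly as with the x86 AES-NI
// intrinsics (hypothetical helper, not part of this header):
//
//   __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
//   {
//       block = _mm_xor_si128(block, rk[0]);        // initial AddRoundKey
//       for (int i = 1; i < 10; i++)
//           block = _mm_aesenc_si128(block, rk[i]); // 9 full rounds
//       return _mm_aesenclast_si128(block, rk[10]); // final round
//   }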
8748
8749// Perform one round of an AES decryption flow on data (state) in a using the
8750// round key in RoundKey, and store the result in dst.
8751// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
8752FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
8753{
8754#if defined(__aarch64__)
8755 static const uint8_t inv_shift_rows[] = {
8756 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8757 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8758 };
8759 static const uint8_t ror32by8[] = {
8760 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8761 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8762 };
8763
8764 uint8x16_t v;
8765 uint8x16_t w = vreinterpretq_u8_m128i(a);
8766
8767 // inverse shift rows
8768 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8769
8770 // inverse sub bytes
8771 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
8772 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
8773 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
8774 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
8775
8776 // inverse mix columns
8777 // multiplying 'v' by 4 in GF(2^8)
8778 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8779 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8780 v ^= w;
8781 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8782
8783 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
8784 0x1b); // multiplying 'v' by 2 in GF(2^8)
8785 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8786 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8787
8788 // add round key
8789 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8790
8791#else /* ARMv7-A NEON implementation */
8792 /* FIXME: optimize for NEON */
8793 uint8_t i, e, f, g, h, v[4][4];
8794 uint8_t *_a = (uint8_t *) &a;
8795 for (i = 0; i < 16; ++i) {
8796 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
8797 }
8798
8799 // inverse mix columns
8800 for (i = 0; i < 4; ++i) {
8801 e = v[i][0];
8802 f = v[i][1];
8803 g = v[i][2];
8804 h = v[i][3];
8805
8806 v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
8807 SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
8808 v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
8809 SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
8810 v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
8811 SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
8812 v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
8813 SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
8814 }
8815
8816 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
8817#endif
8818}
8819
8820// Perform the last round of an AES encryption flow on data (state) in a using
8821// the round key in RoundKey, and store the result in dst.
8822// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
8823FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8824{
8825#if defined(__aarch64__)
8826 static const uint8_t shift_rows[] = {
8827 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8828 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8829 };
8830
8831 uint8x16_t v;
8832 uint8x16_t w = vreinterpretq_u8_m128i(a);
8833
8834 // shift rows
8835 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8836
8837 // sub bytes
8838 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
8839 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
8840 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
8841 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
8842
8843 // add round key
8844 return vreinterpretq_m128i_u8(v) ^ RoundKey;
8845
8846#else /* ARMv7-A implementation */
8847 uint8_t v[16] = {
8848 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
8849 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
8850 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
8851 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
8852 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
8853 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
8854 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
8855 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
8856 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
8857 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
8858 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
8859 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
8860 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
8861 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
8862 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
8863 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
8864 };
8865
8866 return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
8867#endif
8868}
8869
8870// Perform the last round of an AES decryption flow on data (state) in a using
8871// the round key in RoundKey, and store the result in dst.
8872// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
8873FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
8874{
8875#if defined(__aarch64__)
8876 static const uint8_t inv_shift_rows[] = {
8877 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8878 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8879 };
8880
8881 uint8x16_t v;
8882 uint8x16_t w = vreinterpretq_u8_m128i(a);
8883
8884 // inverse shift rows
8885 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8886
8887 // inverse sub bytes
8888 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
8889 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
8890 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
8891 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
8892
8893 // add round key
8894 return vreinterpretq_m128i_u8(v) ^ RoundKey;
8895
8896#else /* ARMv7-A NEON implementation */
8897 /* FIXME: optimize for NEON */
8898 uint8_t v[4][4];
8899 uint8_t *_a = (uint8_t *) &a;
8900 for (int i = 0; i < 16; ++i) {
8901 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
8902 }
8903
8904 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
8905#endif
8906}
8907
8908// Perform the InvMixColumns transformation on a and store the result in dst.
8909// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
8910FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
8911{
8912#if defined(__aarch64__)
8913 static const uint8_t ror32by8[] = {
8914 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8915 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8916 };
8917 uint8x16_t v = vreinterpretq_u8_m128i(a);
8918 uint8x16_t w;
8919
8920 // multiplying 'v' by 4 in GF(2^8)
8921 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8922 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8923 v ^= w;
8924 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8925
8926 // multiplying 'v' by 2 in GF(2^8)
8927 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8928 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8929 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8930 return vreinterpretq_m128i_u8(w);
8931
8932#else /* ARMv7-A NEON implementation */
8933 uint8_t i, e, f, g, h, v[4][4];
8934 vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
8935 for (i = 0; i < 4; ++i) {
8936 e = v[i][0];
8937 f = v[i][1];
8938 g = v[i][2];
8939 h = v[i][3];
8940
8941 v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
8942 SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
8943 v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
8944 SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
8945 v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
8946 SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
8947 v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
8948 SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
8949 }
8950
8951 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
8952#endif
8953}
8954
8955// Assist in expanding the AES cipher key by computing steps towards generating
8956// a round key for encryption cipher using data from a and an 8-bit round
8957// constant specified in imm8, and store the result in dst.
8958// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
8959//
8960// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
8961// This instruction generates a round key for AES encryption. See
8962// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
8963// for details.
8964FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
8965{
8966#if defined(__aarch64__)
8967 uint8x16_t _a = vreinterpretq_u8_m128i(a);
8968 uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
8969 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
8970 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
8971 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
8972
8973 uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
8974 uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
8975 uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
8976
8977 return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
8978
8979#else /* ARMv7-A NEON implementation */
8980 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
8981 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
8982 for (int i = 0; i < 4; ++i) {
8983 ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
8984 ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
8985 }
8986 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8987 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8988#endif
8989}
8990#undef SSE2NEON_AES_SBOX
8991#undef SSE2NEON_AES_RSBOX
8992
8993#if defined(__aarch64__)
8994#undef SSE2NEON_XT
8995#undef SSE2NEON_MULTIPLY
8996#endif
8997
8998#else /* __ARM_FEATURE_CRYPTO */
8999// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
9000// AESMC and then manually applying the real key as an xor operation. This
9001// unfortunately means an additional xor op; the compiler should be able to
9002// optimize this away for repeated calls however. See
9003// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
9004// for more details.
9005FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
9006{
9007 return vreinterpretq_m128i_u8(veorq_u8(
9008 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
9009 vreinterpretq_u8_m128i(b)));
9010}
9011
9012// Perform one round of an AES decryption flow on data (state) in a using the
9013// round key in RoundKey, and store the result in dst.
9014// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
9015FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
9016{
9017 return vreinterpretq_m128i_u8(veorq_u8(
9018 vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
9019 vreinterpretq_u8_m128i(RoundKey)));
9020}
9021
9022// Perform the last round of an AES encryption flow on data (state) in a using
9023// the round key in RoundKey, and store the result in dst.
9024// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
9025FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
9026{
9027 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
9028 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
9029 RoundKey);
9030}
9031
9032// Perform the last round of an AES decryption flow on data (state) in a using
9033// the round key in RoundKey, and store the result in dst.
9034// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
9035FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
9036{
9037 return vreinterpretq_m128i_u8(
9038 veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)),
9039 vreinterpretq_u8_m128i(RoundKey)));
9040}
9041
9042// Perform the InvMixColumns transformation on a and store the result in dst.
9043// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
9044FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
9045{
9046 return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
9047}
9048
9049// Assist in expanding the AES cipher key by computing steps towards generating
9050// a round key for encryption cipher using data from a and an 8-bit round
9051// constant specified in imm8, and store the result in dst.
9052// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
9053FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
9054{
9055 // AESE does ShiftRows and SubBytes on A
9056 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
9057
9058#ifndef _MSC_VER
9059 uint8x16_t dest = {
9060 // Undo ShiftRows step from AESE and extract X1 and X3
9061 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
9062 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
9063 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
9064 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
9065 };
9066 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
9067 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
9068#else
9069 // We have to do this hack because MSVC strictly adheres to the C++
9070 // standard, in particular C++03 8.5.1 sub-section 15, which states that
9071 // unions must be initialized by their first member type.
9072
9073 // As per the Windows ARM64 ABI, it is always little endian, so this works
9074 __n128 dest{
9075 ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
9076 ((uint64_t) u8.n128_u8[0xE] << 16) |
9077 ((uint64_t) u8.n128_u8[0xB] << 24) |
9078 ((uint64_t) u8.n128_u8[0x1] << 32) |
9079 ((uint64_t) u8.n128_u8[0xE] << 40) |
9080 ((uint64_t) u8.n128_u8[0xB] << 48) |
9081 ((uint64_t) u8.n128_u8[0x4] << 56),
9082 ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
9083 ((uint64_t) u8.n128_u8[0x6] << 16) |
9084 ((uint64_t) u8.n128_u8[0x3] << 24) |
9085 ((uint64_t) u8.n128_u8[0x9] << 32) |
9086 ((uint64_t) u8.n128_u8[0x6] << 40) |
9087 ((uint64_t) u8.n128_u8[0x3] << 48) |
9088 ((uint64_t) u8.n128_u8[0xC] << 56)};
9089
9090 dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
9091 dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
9092
9093 return dest;
9094#endif
9095}
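
// Editor's illustrative sketch (not part of the upstream header): one step of
// AES-128 key expansion built on the assist intrinsic, following the
// shuffle/xor pattern documented by Intel. The helper name is an assumption
// for this example only.
static inline __m128i _sse2neon_example_aes128_expand(__m128i key,
                                                      __m128i assist)
{
    // Broadcast the lane holding ROT(SubWord(X3)) ^ rcon to all four lanes.
    assist = _mm_shuffle_epi32(assist, 0xFF);  // i.e. _MM_SHUFFLE(3, 3, 3, 3)
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, assist);
}
// Usage: rk[1] = _sse2neon_example_aes128_expand(
//     rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01));
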
#endif

/* Others */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}

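// Editor's illustrative sketch (not part of the upstream header): the low
// nibble of imm8 selects the 64-bit half of a, the high nibble the half of b,
// e.g. a GF(2)[x] product of the two low halves vs. the two high halves.
static inline void _sse2neon_example_clmul_halves(__m128i a,
                                                  __m128i b,
                                                  __m128i *lo_lo,
                                                  __m128i *hi_hi)
{
    *lo_lo = _mm_clmulepi64_si128(a, b, 0x00);  // a[63:0]   x b[63:0]
    *hi_hi = _mm_clmulepi64_si128(a, b, 0x11);  // a[127:64] x b[127:64]
}
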
FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#elif defined(_MSC_VER)
    return _CountOneBits(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    // Zero-extend a to 64 bits so the full 8-byte vector load is in bounds.
    input_val = vcreate_u8((uint64_t) a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    // The zero-extended upper half contributes nothing, so lane 0 holds the
    // full count; extract it rather than storing both lanes over a uint32_t.
    return (int) vget_lane_u32(count32x2_val, 0);
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#elif defined(_MSC_VER)
    return _CountOneBits64(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}

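// Editor's illustrative sketch (not part of the upstream header): a quick
// sanity check of the emulated population counts against known values.
static inline int _sse2neon_example_popcnt_check(void)
{
    return _mm_popcnt_u32(0xF0F0F0F0u) == 16 &&
           _mm_popcnt_u64(0x8000000000000001ull) == 2;
}
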
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r.value)); /* write */
#endif
}

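// Editor's illustrative sketch (not part of the upstream header): flush
// denormal inputs to zero around a hot loop, then restore the previous mode.
static inline void _sse2neon_example_daz_scope(void (*hot_loop)(void))
{
    unsigned int prev = _sse2neon_mm_get_denormals_zero_mode();
    _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON);
    hot_loop();
    _sse2neon_mm_set_denormals_zero_mode(prev);
}
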
// Return the current 64-bit value of the processor's time-stamp counter.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
FORCE_INLINE uint64_t _rdtsc(void)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint64_t val;

    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
     * system counter is at least 56 bits wide; from Armv8.6, the counter
     * must be 64 bits wide. So the system counter could be less than 64
     * bits wide, in which case it is reported with the flag
     * 'cap_user_time_short' set to true.
     */
#if defined(_MSC_VER)
    val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
#else
    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
#endif

    return val;
#else
    uint32_t pmccntr, pmuseren, pmcntenset;
    // Read the user mode Performance Monitoring Unit (PMU)
    // User Enable Register (PMUSERENR) access permissions.
    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
        if (pmcntenset & 0x80000000UL) {  // Is it counting?
            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
            // The counter is set up to count every 64th cycle
            return (uint64_t) (pmccntr) << 6;
        }
    }

    // Fall back to gettimeofday, as we can't enable PMUSERENR in user mode.
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
#endif
}

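// Editor's illustrative sketch (not part of the upstream header): measuring a
// region in raw counter ticks. Unlike the x86 TSC, the AArch64 value ticks at
// the generic-timer frequency (CNTFRQ_EL0), not at the CPU clock.
static inline uint64_t _sse2neon_example_measure_ticks(void (*fn)(void))
{
    uint64_t start = _rdtsc();
    fn();
    return _rdtsc() - start;
}
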
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif
diff --git a/src/android/app/src/main/AndroidManifest.xml b/src/android/app/src/main/AndroidManifest.xml
index f011bd696..7890b30ca 100755
--- a/src/android/app/src/main/AndroidManifest.xml
+++ b/src/android/app/src/main/AndroidManifest.xml
@@ -12,8 +12,6 @@ SPDX-License-Identifier: GPL-3.0-or-later
     <uses-feature android:name="android.hardware.vulkan.version" android:version="0x401000" android:required="true" />
 
     <uses-permission android:name="android.permission.INTERNET" />
-    <uses-permission android:name="android.permission.FOREGROUND_SERVICE" />
-    <uses-permission android:name="android.permission.FOREGROUND_SERVICE_SPECIAL_USE" />
     <uses-permission android:name="android.permission.NFC" />
     <uses-permission android:name="android.permission.POST_NOTIFICATIONS" />
 
@@ -80,10 +78,6 @@ SPDX-License-Identifier: GPL-3.0-or-later
                 android:resource="@xml/nfc_tech_filter" />
         </activity>
 
-        <service android:name="org.yuzu.yuzu_emu.utils.ForegroundService" android:foregroundServiceType="specialUse">
-            <property android:name="android.app.PROPERTY_SPECIAL_USE_FGS_SUBTYPE" android:value="Keep emulation running in background"/>
-        </service>
-
         <provider
             android:name=".features.DocumentProvider"
             android:authorities="${applicationId}.user"
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt
index d114bd53d..76778c10a 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt
@@ -17,17 +17,6 @@ fun Context.getPublicFilesDir(): File = getExternalFilesDir(null) ?: filesDir
 
 class YuzuApplication : Application() {
     private fun createNotificationChannels() {
-        val emulationChannel = NotificationChannel(
-            getString(R.string.emulation_notification_channel_id),
-            getString(R.string.emulation_notification_channel_name),
-            NotificationManager.IMPORTANCE_LOW
-        )
-        emulationChannel.description = getString(
-            R.string.emulation_notification_channel_description
-        )
-        emulationChannel.setSound(null, null)
-        emulationChannel.vibrationPattern = null
-
         val noticeChannel = NotificationChannel(
             getString(R.string.notice_notification_channel_id),
             getString(R.string.notice_notification_channel_name),
@@ -39,7 +28,6 @@ class YuzuApplication : Application() {
         // Register the channel with the system; you can't change the importance
         // or other notification behaviors after this
         val notificationManager = getSystemService(NotificationManager::class.java)
-        notificationManager.createNotificationChannel(emulationChannel)
         notificationManager.createNotificationChannel(noticeChannel)
     }
 
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt
index 564aaf305..7a8d03610 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt
@@ -4,7 +4,6 @@
 package org.yuzu.yuzu_emu.activities
 
 import android.annotation.SuppressLint
-import android.app.Activity
 import android.app.PendingIntent
 import android.app.PictureInPictureParams
 import android.app.RemoteAction
@@ -45,7 +44,6 @@ import org.yuzu.yuzu_emu.features.settings.model.IntSetting
 import org.yuzu.yuzu_emu.features.settings.model.Settings
 import org.yuzu.yuzu_emu.model.EmulationViewModel
 import org.yuzu.yuzu_emu.model.Game
-import org.yuzu.yuzu_emu.utils.ForegroundService
 import org.yuzu.yuzu_emu.utils.InputHandler
 import org.yuzu.yuzu_emu.utils.Log
 import org.yuzu.yuzu_emu.utils.MemoryUtil
@@ -74,11 +72,6 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener {
 
     private val emulationViewModel: EmulationViewModel by viewModels()
 
-    override fun onDestroy() {
-        stopForegroundService(this)
-        super.onDestroy()
-    }
-
     override fun onCreate(savedInstanceState: Bundle?) {
         Log.gameLaunched = true
         ThemeHelper.setTheme(this)
@@ -125,10 +118,6 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener {
                     .apply()
             }
         }
-
-        // Start a foreground service to prevent the app from getting killed in the background
-        val startIntent = Intent(this, ForegroundService::class.java)
-        startForegroundService(startIntent)
     }
 
     override fun onKeyDown(keyCode: Int, event: KeyEvent): Boolean {
@@ -481,12 +470,6 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener {
             activity.startActivity(launcher)
         }
 
-        fun stopForegroundService(activity: Activity) {
-            val startIntent = Intent(activity, ForegroundService::class.java)
-            startIntent.action = ForegroundService.ACTION_STOP
-            activity.startForegroundService(startIntent)
-        }
-
         private fun areCoordinatesOutside(view: View?, x: Float, y: Float): Boolean {
             if (view == null) {
                 return true
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
index 86bd33672..664478472 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt
@@ -25,7 +25,8 @@ enum class BooleanSetting(override val key: String) : AbstractBooleanSetting {
     HAPTIC_FEEDBACK("haptic_feedback"),
     SHOW_PERFORMANCE_OVERLAY("show_performance_overlay"),
     SHOW_INPUT_OVERLAY("show_input_overlay"),
-    TOUCHSCREEN("touchscreen");
+    TOUCHSCREEN("touchscreen"),
+    SHOW_THERMAL_OVERLAY("show_thermal_overlay");
 
     override fun getBoolean(needsGlobal: Boolean): Boolean =
         NativeConfig.getBoolean(key, needsGlobal)
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt
index d7ab0b5d9..6f6e7be10 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt
@@ -8,7 +8,6 @@ import android.os.Bundle
 import android.view.LayoutInflater
 import android.view.View
 import android.view.ViewGroup
-import android.view.ViewGroup.MarginLayoutParams
 import androidx.core.view.ViewCompat
 import androidx.core.view.WindowInsetsCompat
 import androidx.core.view.updatePadding
@@ -27,6 +26,7 @@ import org.yuzu.yuzu_emu.R
 import org.yuzu.yuzu_emu.databinding.FragmentSettingsBinding
 import org.yuzu.yuzu_emu.features.settings.model.Settings
 import org.yuzu.yuzu_emu.model.SettingsViewModel
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class SettingsFragment : Fragment() {
     private lateinit var presenter: SettingsFragmentPresenter
@@ -125,18 +125,10 @@ class SettingsFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpSettingsList = binding.listSettings.layoutParams as MarginLayoutParams
-            mlpSettingsList.leftMargin = leftInsets
-            mlpSettingsList.rightMargin = rightInsets
-            binding.listSettings.layoutParams = mlpSettingsList
-            binding.listSettings.updatePadding(
-                bottom = barInsets.bottom
-            )
-
-            val mlpAppBar = binding.appbarSettings.layoutParams as MarginLayoutParams
-            mlpAppBar.leftMargin = leftInsets
-            mlpAppBar.rightMargin = rightInsets
-            binding.appbarSettings.layoutParams = mlpAppBar
+            binding.listSettings.updateMargins(left = leftInsets, right = rightInsets)
+            binding.listSettings.updatePadding(bottom = barInsets.bottom)
+
+            binding.appbarSettings.updateMargins(left = leftInsets, right = rightInsets)
             windowInsets
         }
     }
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt
index 5ab38ffda..ff4f0e5df 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt
@@ -13,7 +13,6 @@ import android.os.Bundle
 import android.view.LayoutInflater
 import android.view.View
 import android.view.ViewGroup
-import android.view.ViewGroup.MarginLayoutParams
 import android.widget.Toast
 import androidx.core.view.ViewCompat
 import androidx.core.view.WindowInsetsCompat
@@ -26,6 +25,7 @@ import org.yuzu.yuzu_emu.BuildConfig
 import org.yuzu.yuzu_emu.R
 import org.yuzu.yuzu_emu.databinding.FragmentAboutBinding
 import org.yuzu.yuzu_emu.model.HomeViewModel
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class AboutFragment : Fragment() {
     private var _binding: FragmentAboutBinding? = null
@@ -114,15 +114,8 @@ class AboutFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpToolbar = binding.toolbarAbout.layoutParams as MarginLayoutParams
-            mlpToolbar.leftMargin = leftInsets
-            mlpToolbar.rightMargin = rightInsets
-            binding.toolbarAbout.layoutParams = mlpToolbar
-
-            val mlpScrollAbout = binding.scrollAbout.layoutParams as MarginLayoutParams
-            mlpScrollAbout.leftMargin = leftInsets
-            mlpScrollAbout.rightMargin = rightInsets
-            binding.scrollAbout.layoutParams = mlpScrollAbout
+            binding.toolbarAbout.updateMargins(left = leftInsets, right = rightInsets)
+            binding.scrollAbout.updateMargins(left = leftInsets, right = rightInsets)
 
             binding.contentAbout.updatePadding(bottom = barInsets.bottom)
 
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt
index adb65812c..f5647fa95 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt
@@ -31,6 +31,7 @@ import org.yuzu.yuzu_emu.model.AddonViewModel
 import org.yuzu.yuzu_emu.model.HomeViewModel
 import org.yuzu.yuzu_emu.utils.AddonUtil
 import org.yuzu.yuzu_emu.utils.FileUtil.copyFilesTo
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 import java.io.File
 
 class AddonsFragment : Fragment() {
@@ -202,27 +203,19 @@ class AddonsFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpToolbar = binding.toolbarAddons.layoutParams as ViewGroup.MarginLayoutParams
-            mlpToolbar.leftMargin = leftInsets
-            mlpToolbar.rightMargin = rightInsets
-            binding.toolbarAddons.layoutParams = mlpToolbar
-
-            val mlpAddonsList = binding.listAddons.layoutParams as ViewGroup.MarginLayoutParams
-            mlpAddonsList.leftMargin = leftInsets
-            mlpAddonsList.rightMargin = rightInsets
-            binding.listAddons.layoutParams = mlpAddonsList
+            binding.toolbarAddons.updateMargins(left = leftInsets, right = rightInsets)
+            binding.listAddons.updateMargins(left = leftInsets, right = rightInsets)
             binding.listAddons.updatePadding(
                 bottom = barInsets.bottom +
                     resources.getDimensionPixelSize(R.dimen.spacing_bottom_list_fab)
             )
 
             val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab)
-            val mlpFab =
-                binding.buttonInstall.layoutParams as ViewGroup.MarginLayoutParams
-            mlpFab.leftMargin = leftInsets + fabSpacing
-            mlpFab.rightMargin = rightInsets + fabSpacing
-            mlpFab.bottomMargin = barInsets.bottom + fabSpacing
-            binding.buttonInstall.layoutParams = mlpFab
+            binding.buttonInstall.updateMargins(
+                left = leftInsets + fabSpacing,
+                right = rightInsets + fabSpacing,
+                bottom = barInsets.bottom + fabSpacing
+            )
 
             windowInsets
         }
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt
index 1f66b440d..73ca40484 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt
@@ -21,6 +21,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentAppletLauncherBinding
 import org.yuzu.yuzu_emu.model.Applet
 import org.yuzu.yuzu_emu.model.AppletInfo
 import org.yuzu.yuzu_emu.model.HomeViewModel
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class AppletLauncherFragment : Fragment() {
     private var _binding: FragmentAppletLauncherBinding? = null
@@ -95,16 +96,8 @@ class AppletLauncherFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpAppBar = binding.toolbarApplets.layoutParams as ViewGroup.MarginLayoutParams
-            mlpAppBar.leftMargin = leftInsets
-            mlpAppBar.rightMargin = rightInsets
-            binding.toolbarApplets.layoutParams = mlpAppBar
-
-            val mlpListApplets =
-                binding.listApplets.layoutParams as ViewGroup.MarginLayoutParams
-            mlpListApplets.leftMargin = leftInsets
-            mlpListApplets.rightMargin = rightInsets
-            binding.listApplets.layoutParams = mlpListApplets
+            binding.toolbarApplets.updateMargins(left = leftInsets, right = rightInsets)
+            binding.listApplets.updateMargins(left = leftInsets, right = rightInsets)
 
             binding.listApplets.updatePadding(bottom = barInsets.bottom)
 
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt
index bf017cd7c..41cff46c1 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt
@@ -34,6 +34,7 @@ import org.yuzu.yuzu_emu.model.HomeViewModel
 import org.yuzu.yuzu_emu.utils.FileUtil
 import org.yuzu.yuzu_emu.utils.GpuDriverHelper
 import org.yuzu.yuzu_emu.utils.NativeConfig
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 import java.io.File
 import java.io.IOException
 
@@ -141,23 +142,15 @@ class DriverManagerFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpAppBar = binding.toolbarDrivers.layoutParams as ViewGroup.MarginLayoutParams
-            mlpAppBar.leftMargin = leftInsets
-            mlpAppBar.rightMargin = rightInsets
-            binding.toolbarDrivers.layoutParams = mlpAppBar
-
-            val mlplistDrivers = binding.listDrivers.layoutParams as ViewGroup.MarginLayoutParams
-            mlplistDrivers.leftMargin = leftInsets
-            mlplistDrivers.rightMargin = rightInsets
-            binding.listDrivers.layoutParams = mlplistDrivers
+            binding.toolbarDrivers.updateMargins(left = leftInsets, right = rightInsets)
+            binding.listDrivers.updateMargins(left = leftInsets, right = rightInsets)
 
             val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab)
-            val mlpFab =
-                binding.buttonInstall.layoutParams as ViewGroup.MarginLayoutParams
-            mlpFab.leftMargin = leftInsets + fabSpacing
-            mlpFab.rightMargin = rightInsets + fabSpacing
-            mlpFab.bottomMargin = barInsets.bottom + fabSpacing
-            binding.buttonInstall.layoutParams = mlpFab
+            binding.buttonInstall.updateMargins(
+                left = leftInsets + fabSpacing,
+                right = rightInsets + fabSpacing,
+                bottom = barInsets.bottom + fabSpacing
+            )
 
             binding.listDrivers.updatePadding(
                 bottom = barInsets.bottom +
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt
index dbc16da4a..0534b68ce 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt
@@ -19,6 +19,7 @@ import com.google.android.material.transition.MaterialSharedAxis
 import org.yuzu.yuzu_emu.R
 import org.yuzu.yuzu_emu.databinding.FragmentEarlyAccessBinding
 import org.yuzu.yuzu_emu.model.HomeViewModel
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class EarlyAccessFragment : Fragment() {
     private var _binding: FragmentEarlyAccessBinding? = null
@@ -73,10 +74,7 @@ class EarlyAccessFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpAppBar = binding.appbarEa.layoutParams as ViewGroup.MarginLayoutParams
-            mlpAppBar.leftMargin = leftInsets
-            mlpAppBar.rightMargin = rightInsets
-            binding.appbarEa.layoutParams = mlpAppBar
+            binding.appbarEa.updateMargins(left = leftInsets, right = rightInsets)
 
             binding.scrollEa.updatePadding(
                 left = leftInsets,
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
index 937b8faf1..44af896da 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
@@ -13,6 +13,7 @@ import android.net.Uri
 import android.os.Bundle
 import android.os.Handler
 import android.os.Looper
+import android.os.PowerManager
 import android.os.SystemClock
 import android.view.*
 import android.widget.TextView
@@ -23,6 +24,7 @@ import androidx.core.content.res.ResourcesCompat
 import androidx.core.graphics.Insets
 import androidx.core.view.ViewCompat
 import androidx.core.view.WindowInsetsCompat
+import androidx.core.view.updatePadding
 import androidx.drawerlayout.widget.DrawerLayout
 import androidx.drawerlayout.widget.DrawerLayout.DrawerListener
 import androidx.fragment.app.Fragment
@@ -38,7 +40,6 @@ import androidx.window.layout.WindowLayoutInfo
 import com.google.android.material.dialog.MaterialAlertDialogBuilder
 import com.google.android.material.slider.Slider
 import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.flow.collect
 import kotlinx.coroutines.flow.collectLatest
 import kotlinx.coroutines.launch
 import org.yuzu.yuzu_emu.HomeNavigationDirections
@@ -64,6 +65,7 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
     private lateinit var emulationState: EmulationState
     private var emulationActivity: EmulationActivity? = null
     private var perfStatsUpdater: (() -> Unit)? = null
+    private var thermalStatsUpdater: (() -> Unit)? = null
 
     private var _binding: FragmentEmulationBinding? = null
     private val binding get() = _binding!!
@@ -77,6 +79,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
 
     private var isInFoldableLayout = false
 
+    private lateinit var powerManager: PowerManager
+
     override fun onAttach(context: Context) {
         super.onAttach(context)
         if (context is EmulationActivity) {
@@ -102,6 +106,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
         super.onCreate(savedInstanceState)
         updateOrientation()
 
+        powerManager = requireContext().getSystemService(Context.POWER_SERVICE) as PowerManager
+
         val intentUri: Uri? = requireActivity().intent.data
         var intentGame: Game? = null
         if (intentUri != null) {
@@ -394,8 +400,9 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
 
                    emulationState.updateSurface()
 
-                   // Setup overlay
+                   // Setup overlays
                    updateShowFpsOverlay()
+                   updateThermalOverlay()
                }
            }
        }
@@ -553,6 +560,38 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
         }
     }
 
+    private fun updateThermalOverlay() {
+        if (BooleanSetting.SHOW_THERMAL_OVERLAY.getBoolean()) {
+            thermalStatsUpdater = {
+                if (emulationViewModel.emulationStarted.value &&
+                    !emulationViewModel.isEmulationStopping.value
+                ) {
+                    val thermalStatus = when (powerManager.currentThermalStatus) {
+                        PowerManager.THERMAL_STATUS_LIGHT -> "😥"
+                        PowerManager.THERMAL_STATUS_MODERATE -> "🥵"
+                        PowerManager.THERMAL_STATUS_SEVERE -> "🔥"
+                        PowerManager.THERMAL_STATUS_CRITICAL,
+                        PowerManager.THERMAL_STATUS_EMERGENCY,
+                        PowerManager.THERMAL_STATUS_SHUTDOWN -> "☢️"
+
+                        else -> "🙂"
+                    }
+                    if (_binding != null) {
+                        binding.showThermalsText.text = thermalStatus
+                    }
+                    thermalStatsUpdateHandler.postDelayed(thermalStatsUpdater!!, 1000)
+                }
+            }
+            thermalStatsUpdateHandler.post(thermalStatsUpdater!!)
+            binding.showThermalsText.visibility = View.VISIBLE
+        } else {
+            if (thermalStatsUpdater != null) {
+                thermalStatsUpdateHandler.removeCallbacks(thermalStatsUpdater!!)
+            }
+            binding.showThermalsText.visibility = View.GONE
+        }
+    }
+
     @SuppressLint("SourceLockedOrientationActivity")
     private fun updateOrientation() {
         emulationActivity?.let {
@@ -641,6 +680,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
             popup.menu.apply {
                 findItem(R.id.menu_toggle_fps).isChecked =
                     BooleanSetting.SHOW_PERFORMANCE_OVERLAY.getBoolean()
+                findItem(R.id.thermal_indicator).isChecked =
+                    BooleanSetting.SHOW_THERMAL_OVERLAY.getBoolean()
                 findItem(R.id.menu_rel_stick_center).isChecked =
                     BooleanSetting.JOYSTICK_REL_CENTER.getBoolean()
                 findItem(R.id.menu_dpad_slide).isChecked = BooleanSetting.DPAD_SLIDE.getBoolean()
@@ -660,6 +701,13 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
                 true
             }
 
+            R.id.thermal_indicator -> {
+                it.isChecked = !it.isChecked
+                BooleanSetting.SHOW_THERMAL_OVERLAY.setBoolean(it.isChecked)
+                updateThermalOverlay()
+                true
+            }
+
             R.id.menu_edit_overlay -> {
                 binding.drawerLayout.close()
                 binding.surfaceInputOverlay.requestFocus()
@@ -850,7 +898,7 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
                 right = cutInsets.right
             }
 
-            v.setPadding(left, cutInsets.top, right, 0)
+            v.updatePadding(left = left, top = cutInsets.top, right = right)
             windowInsets
         }
     }
@@ -1003,5 +1051,6 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
 
     companion object {
         private val perfStatsUpdateHandler = Handler(Looper.myLooper()!!)
+        private val thermalStatsUpdateHandler = Handler(Looper.myLooper()!!)
     }
 }
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt
index 341a37fdb..5c558b1a5 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt
@@ -26,6 +26,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentFoldersBinding
 import org.yuzu.yuzu_emu.model.GamesViewModel
 import org.yuzu.yuzu_emu.model.HomeViewModel
 import org.yuzu.yuzu_emu.ui.main.MainActivity
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class GameFoldersFragment : Fragment() {
     private var _binding: FragmentFoldersBinding? = null
@@ -100,23 +101,16 @@ class GameFoldersFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpToolbar = binding.toolbarFolders.layoutParams as ViewGroup.MarginLayoutParams
-            mlpToolbar.leftMargin = leftInsets
-            mlpToolbar.rightMargin = rightInsets
-            binding.toolbarFolders.layoutParams = mlpToolbar
+            binding.toolbarFolders.updateMargins(left = leftInsets, right = rightInsets)
 
             val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab)
-            val mlpFab =
-                binding.buttonAdd.layoutParams as ViewGroup.MarginLayoutParams
-            mlpFab.leftMargin = leftInsets + fabSpacing
-            mlpFab.rightMargin = rightInsets + fabSpacing
-            mlpFab.bottomMargin = barInsets.bottom + fabSpacing
-            binding.buttonAdd.layoutParams = mlpFab
-
-            val mlpListFolders = binding.listFolders.layoutParams as ViewGroup.MarginLayoutParams
-            mlpListFolders.leftMargin = leftInsets
-            mlpListFolders.rightMargin = rightInsets
-            binding.listFolders.layoutParams = mlpListFolders
+            binding.buttonAdd.updateMargins(
+                left = leftInsets + fabSpacing,
+                right = rightInsets + fabSpacing,
+                bottom = barInsets.bottom + fabSpacing
+            )
+
+            binding.listFolders.updateMargins(left = leftInsets, right = rightInsets)
 
             binding.listFolders.updatePadding(
                 bottom = barInsets.bottom +
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt
index 5aa3f453f..dbd56e84f 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt
@@ -27,6 +27,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentGameInfoBinding
 import org.yuzu.yuzu_emu.model.GameVerificationResult
 import org.yuzu.yuzu_emu.model.HomeViewModel
 import org.yuzu.yuzu_emu.utils.GameMetadata
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class GameInfoFragment : Fragment() {
     private var _binding: FragmentGameInfoBinding? = null
@@ -122,11 +123,13 @@ class GameInfoFragment : Fragment() {
                         titleId = R.string.verify_success,
                         descriptionId = R.string.operation_completed_successfully
                     )
+
                 GameVerificationResult.Failed ->
                     MessageDialogFragment.newInstance(
                         titleId = R.string.verify_failure,
                         descriptionId = R.string.verify_failure_description
                     )
+
                 GameVerificationResult.NotImplemented ->
                     MessageDialogFragment.newInstance(
                         titleId = R.string.verify_no_result,
@@ -165,15 +168,8 @@ class GameInfoFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpToolbar = binding.toolbarInfo.layoutParams as ViewGroup.MarginLayoutParams
-            mlpToolbar.leftMargin = leftInsets
-            mlpToolbar.rightMargin = rightInsets
-            binding.toolbarInfo.layoutParams = mlpToolbar
-
-            val mlpScrollAbout = binding.scrollInfo.layoutParams as ViewGroup.MarginLayoutParams
-            mlpScrollAbout.leftMargin = leftInsets
-            mlpScrollAbout.rightMargin = rightInsets
-            binding.scrollInfo.layoutParams = mlpScrollAbout
+            binding.toolbarInfo.updateMargins(left = leftInsets, right = rightInsets)
+            binding.scrollInfo.updateMargins(left = leftInsets, right = rightInsets)
 
             binding.contentInfo.updatePadding(bottom = barInsets.bottom)
 
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt
index 582df0133..d14b2c634 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt
@@ -46,6 +46,7 @@ import org.yuzu.yuzu_emu.utils.FileUtil
 import org.yuzu.yuzu_emu.utils.GameIconUtils
 import org.yuzu.yuzu_emu.utils.GpuDriverHelper
 import org.yuzu.yuzu_emu.utils.MemoryUtil
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 import java.io.BufferedOutputStream
 import java.io.File
 
@@ -320,46 +321,25 @@ class GamePropertiesFragment : Fragment() {
 
             val smallLayout = resources.getBoolean(R.bool.small_layout)
             if (smallLayout) {
-                val mlpListAll =
-                    binding.listAll.layoutParams as ViewGroup.MarginLayoutParams
-                mlpListAll.leftMargin = leftInsets
-                mlpListAll.rightMargin = rightInsets
-                binding.listAll.layoutParams = mlpListAll
+                binding.listAll.updateMargins(left = leftInsets, right = rightInsets)
             } else {
                 if (ViewCompat.getLayoutDirection(binding.root) ==
                     ViewCompat.LAYOUT_DIRECTION_LTR
                 ) {
-                    val mlpListAll =
-                        binding.listAll.layoutParams as ViewGroup.MarginLayoutParams
-                    mlpListAll.rightMargin = rightInsets
-                    binding.listAll.layoutParams = mlpListAll
-
-                    val mlpIconLayout =
-                        binding.iconLayout!!.layoutParams as ViewGroup.MarginLayoutParams
-                    mlpIconLayout.topMargin = barInsets.top
-                    mlpIconLayout.leftMargin = leftInsets
-                    binding.iconLayout!!.layoutParams = mlpIconLayout
+                    binding.listAll.updateMargins(right = rightInsets)
+                    binding.iconLayout!!.updateMargins(top = barInsets.top, left = leftInsets)
                 } else {
-                    val mlpListAll =
-                        binding.listAll.layoutParams as ViewGroup.MarginLayoutParams
-                    mlpListAll.leftMargin = leftInsets
-                    binding.listAll.layoutParams = mlpListAll
-
-                    val mlpIconLayout =
-                        binding.iconLayout!!.layoutParams as ViewGroup.MarginLayoutParams
-                    mlpIconLayout.topMargin = barInsets.top
-                    mlpIconLayout.rightMargin = rightInsets
-                    binding.iconLayout!!.layoutParams = mlpIconLayout
+                    binding.listAll.updateMargins(left = leftInsets)
+                    binding.iconLayout!!.updateMargins(top = barInsets.top, right = rightInsets)
                 }
             }
 
             val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab)
-            val mlpFab =
-                binding.buttonStart.layoutParams as ViewGroup.MarginLayoutParams
-            mlpFab.leftMargin = leftInsets + fabSpacing
-            mlpFab.rightMargin = rightInsets + fabSpacing
-            mlpFab.bottomMargin = barInsets.bottom + fabSpacing
-            binding.buttonStart.layoutParams = mlpFab
+            binding.buttonStart.updateMargins(
+                left = leftInsets + fabSpacing,
+                right = rightInsets + fabSpacing,
+                bottom = barInsets.bottom + fabSpacing
+            )
 
             binding.layoutAll.updatePadding(
                 top = barInsets.top,
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt
index 1f3578b22..87e130d3e 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt
@@ -12,7 +12,6 @@ import android.provider.DocumentsContract
 import android.view.LayoutInflater
 import android.view.View
 import android.view.ViewGroup
-import android.view.ViewGroup.MarginLayoutParams
 import android.widget.Toast
 import androidx.appcompat.app.AppCompatActivity
 import androidx.core.app.ActivityCompat
@@ -44,6 +43,7 @@ import org.yuzu.yuzu_emu.ui.main.MainActivity
 import org.yuzu.yuzu_emu.utils.FileUtil
 import org.yuzu.yuzu_emu.utils.GpuDriverHelper
 import org.yuzu.yuzu_emu.utils.Log
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class HomeSettingsFragment : Fragment() {
     private var _binding: FragmentHomeSettingsBinding? = null
@@ -408,10 +408,7 @@ class HomeSettingsFragment : Fragment() {
                 bottom = barInsets.bottom
             )
 
-            val mlpScrollSettings = binding.scrollViewSettings.layoutParams as MarginLayoutParams
-            mlpScrollSettings.leftMargin = leftInsets
-            mlpScrollSettings.rightMargin = rightInsets
-            binding.scrollViewSettings.layoutParams = mlpScrollSettings
+            binding.scrollViewSettings.updateMargins(left = leftInsets, right = rightInsets)
 
             binding.linearLayoutSettings.updatePadding(bottom = spacingNavigation)
 
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt
index 7df8e6bf4..63112dc6f 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt
@@ -34,6 +34,7 @@ import org.yuzu.yuzu_emu.model.TaskState
 import org.yuzu.yuzu_emu.ui.main.MainActivity
 import org.yuzu.yuzu_emu.utils.DirectoryInitialization
 import org.yuzu.yuzu_emu.utils.FileUtil
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 import java.io.BufferedOutputStream
 import java.io.File
 import java.math.BigInteger
@@ -172,16 +173,8 @@ class InstallableFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpAppBar = binding.toolbarInstallables.layoutParams as ViewGroup.MarginLayoutParams
-            mlpAppBar.leftMargin = leftInsets
-            mlpAppBar.rightMargin = rightInsets
-            binding.toolbarInstallables.layoutParams = mlpAppBar
-
-            val mlpScrollAbout =
-                binding.listInstallables.layoutParams as ViewGroup.MarginLayoutParams
-            mlpScrollAbout.leftMargin = leftInsets
-            mlpScrollAbout.rightMargin = rightInsets
-            binding.listInstallables.layoutParams = mlpScrollAbout
+            binding.toolbarInstallables.updateMargins(left = leftInsets, right = rightInsets)
+            binding.listInstallables.updateMargins(left = leftInsets, right = rightInsets)
 
             binding.listInstallables.updatePadding(bottom = barInsets.bottom)
 
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt
index b6e9129f7..f17f621f8 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt
@@ -7,7 +7,6 @@ import android.os.Bundle
 import android.view.LayoutInflater
 import android.view.View
 import android.view.ViewGroup
-import android.view.ViewGroup.MarginLayoutParams
 import androidx.appcompat.app.AppCompatActivity
 import androidx.core.view.ViewCompat
 import androidx.core.view.WindowInsetsCompat
@@ -22,6 +21,7 @@ import org.yuzu.yuzu_emu.adapters.LicenseAdapter
 import org.yuzu.yuzu_emu.databinding.FragmentLicensesBinding
 import org.yuzu.yuzu_emu.model.HomeViewModel
 import org.yuzu.yuzu_emu.model.License
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class LicensesFragment : Fragment() {
     private var _binding: FragmentLicensesBinding? = null
@@ -122,15 +122,8 @@ class LicensesFragment : Fragment() {
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
 
-            val mlpAppBar = binding.appbarLicenses.layoutParams as MarginLayoutParams
-            mlpAppBar.leftMargin = leftInsets
-            mlpAppBar.rightMargin = rightInsets
-            binding.appbarLicenses.layoutParams = mlpAppBar
-
-            val mlpScrollAbout = binding.listLicenses.layoutParams as MarginLayoutParams
-            mlpScrollAbout.leftMargin = leftInsets
-            mlpScrollAbout.rightMargin = rightInsets
-            binding.listLicenses.layoutParams = mlpScrollAbout
+            binding.appbarLicenses.updateMargins(left = leftInsets, right = rightInsets)
+            binding.listLicenses.updateMargins(left = leftInsets, right = rightInsets)
 
             binding.listLicenses.updatePadding(bottom = barInsets.bottom)
 
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt
index f95d545bf..a135b80b4 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt
@@ -29,6 +29,7 @@ import org.yuzu.yuzu_emu.features.settings.model.view.SettingsItem
 import org.yuzu.yuzu_emu.features.settings.ui.SettingsAdapter
 import org.yuzu.yuzu_emu.model.SettingsViewModel
 import org.yuzu.yuzu_emu.utils.NativeConfig
+import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
 
 class SettingsSearchFragment : Fragment() {
     private var _binding: FragmentSettingsSearchBinding? = null
@@ -174,15 +175,14 @@ class SettingsSearchFragment : Fragment() {
                 bottom = barInsets.bottom
             )
 
-            val mlpSettingsList = binding.settingsList.layoutParams as ViewGroup.MarginLayoutParams
-            mlpSettingsList.leftMargin = leftInsets + sideMargin
-            mlpSettingsList.rightMargin = rightInsets + sideMargin
-            binding.settingsList.layoutParams = mlpSettingsList
-
-            val mlpDivider = binding.divider.layoutParams as ViewGroup.MarginLayoutParams
-            mlpDivider.leftMargin = leftInsets + sideMargin
-            mlpDivider.rightMargin = rightInsets + sideMargin
-            binding.divider.layoutParams = mlpDivider
+            binding.settingsList.updateMargins(
+                left = leftInsets + sideMargin,
+                right = rightInsets + sideMargin
+            )
+            binding.divider.updateMargins(
+                left = leftInsets + sideMargin,
+                right = rightInsets + sideMargin
+            )
 
             windowInsets
         }
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt
index 54380323e..23ca49b53 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt
@@ -8,7 +8,6 @@ import android.os.Bundle
8import android.view.LayoutInflater 8import android.view.LayoutInflater
9import android.view.View 9import android.view.View
10import android.view.ViewGroup 10import android.view.ViewGroup
11import android.view.ViewGroup.MarginLayoutParams
12import androidx.appcompat.app.AppCompatActivity 11import androidx.appcompat.app.AppCompatActivity
13import androidx.core.view.ViewCompat 12import androidx.core.view.ViewCompat
14import androidx.core.view.WindowInsetsCompat 13import androidx.core.view.WindowInsetsCompat
@@ -27,6 +26,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentGamesBinding
27import org.yuzu.yuzu_emu.layout.AutofitGridLayoutManager 26import org.yuzu.yuzu_emu.layout.AutofitGridLayoutManager
28import org.yuzu.yuzu_emu.model.GamesViewModel 27import org.yuzu.yuzu_emu.model.GamesViewModel
29import org.yuzu.yuzu_emu.model.HomeViewModel 28import org.yuzu.yuzu_emu.model.HomeViewModel
29import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
30 30
31class GamesFragment : Fragment() { 31class GamesFragment : Fragment() {
32 private var _binding: FragmentGamesBinding? = null 32 private var _binding: FragmentGamesBinding? = null
@@ -169,15 +169,16 @@ class GamesFragment : Fragment() {
 
             val leftInsets = barInsets.left + cutoutInsets.left
             val rightInsets = barInsets.right + cutoutInsets.right
-            val mlpSwipe = binding.swipeRefresh.layoutParams as MarginLayoutParams
+            val left: Int
+            val right: Int
             if (ViewCompat.getLayoutDirection(view) == ViewCompat.LAYOUT_DIRECTION_LTR) {
-                mlpSwipe.leftMargin = leftInsets + spacingNavigationRail
-                mlpSwipe.rightMargin = rightInsets
+                left = leftInsets + spacingNavigationRail
+                right = rightInsets
             } else {
-                mlpSwipe.leftMargin = leftInsets
-                mlpSwipe.rightMargin = rightInsets + spacingNavigationRail
+                left = leftInsets
+                right = rightInsets + spacingNavigationRail
            }
-            binding.swipeRefresh.layoutParams = mlpSwipe
+            binding.swipeRefresh.updateMargins(left = left, right = right)
 
             binding.noticeText.updatePadding(bottom = spacingNavigation)
 
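
The branch above picks which physical edge absorbs the navigation-rail spacing before handing both values to the new updateMargins extension. A minimal Kotlin sketch of that decision as a standalone helper (the helper name is hypothetical, not part of this commit):

    // Hypothetical helper: the navigation rail sits at the start edge, so the
    // extra spacing follows the resolved layout direction.
    fun railAwareMargins(
        isLtr: Boolean,
        leftInsets: Int,
        rightInsets: Int,
        railSpacing: Int
    ): Pair<Int, Int> =
        if (isLtr) {
            (leftInsets + railSpacing) to rightInsets
        } else {
            leftInsets to (rightInsets + railSpacing)
        }
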
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
index b3967d294..4df4ac4c6 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
@@ -34,7 +34,6 @@ import kotlinx.coroutines.launch
 import org.yuzu.yuzu_emu.HomeNavigationDirections
 import org.yuzu.yuzu_emu.NativeLibrary
 import org.yuzu.yuzu_emu.R
-import org.yuzu.yuzu_emu.activities.EmulationActivity
 import org.yuzu.yuzu_emu.databinding.ActivityMainBinding
 import org.yuzu.yuzu_emu.features.settings.model.Settings
 import org.yuzu.yuzu_emu.fragments.AddGameFolderDialogFragment
@@ -177,9 +176,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
             }
         }
 
-        // Dismiss previous notifications (should not happen unless a crash occurred)
-        EmulationActivity.stopForegroundService(this)
-
         setInsets()
     }
 
@@ -298,11 +294,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
         super.onResume()
     }
 
-    override fun onDestroy() {
-        EmulationActivity.stopForegroundService(this)
-        super.onDestroy()
-    }
-
     private fun setInsets() =
         ViewCompat.setOnApplyWindowInsetsListener(
             binding.root
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt
index f9a3e4126..ffbfa9337 100755
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt
@@ -4,6 +4,7 @@
 package org.yuzu.yuzu_emu.utils
 
 import android.view.View
+import android.view.ViewGroup
 
 object ViewUtils {
     fun showView(view: View, length: Long = 300) {
@@ -32,4 +33,28 @@ object ViewUtils {
             view.visibility = View.INVISIBLE
         }.start()
     }
+
+    fun View.updateMargins(
+        left: Int = -1,
+        top: Int = -1,
+        right: Int = -1,
+        bottom: Int = -1
+    ) {
+        val layoutParams = this.layoutParams as ViewGroup.MarginLayoutParams
+        layoutParams.apply {
+            if (left != -1) {
+                leftMargin = left
+            }
+            if (top != -1) {
+                topMargin = top
+            }
+            if (right != -1) {
+                rightMargin = right
+            }
+            if (bottom != -1) {
+                bottomMargin = bottom
+            }
+        }
+        this.layoutParams = layoutParams
+    }
 }
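
The new updateMargins extension replaces the repeated cast-mutate-assign dance around ViewGroup.MarginLayoutParams; sides left at the -1 default keep their current margin. A minimal usage sketch, assuming the target view is already laid out by a parent that produces MarginLayoutParams (the helper name is illustrative):

    import android.view.View
    import androidx.core.view.ViewCompat
    import androidx.core.view.WindowInsetsCompat
    import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins

    // Sketch: push a view away from the left/right system bars; top and bottom
    // margins are untouched because they keep the -1 default.
    fun applyHorizontalInsets(target: View) {
        ViewCompat.setOnApplyWindowInsetsListener(target) { view, windowInsets ->
            val bars = windowInsets.getInsets(WindowInsetsCompat.Type.systemBars())
            view.updateMargins(left = bars.left, right = bars.right)
            windowInsets
        }
    }

One caveat the implementation inherits: a real margin of -1 can no longer be expressed through this helper, which is acceptable here since inset values are never negative.
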
diff --git a/src/android/app/src/main/jni/CMakeLists.txt b/src/android/app/src/main/jni/CMakeLists.txt
index abc6055ab..20b319c12 100755
--- a/src/android/app/src/main/jni/CMakeLists.txt
+++ b/src/android/app/src/main/jni/CMakeLists.txt
@@ -2,14 +2,8 @@
 # SPDX-License-Identifier: GPL-3.0-or-later
 
 add_library(yuzu-android SHARED
-    android_common/android_common.cpp
-    android_common/android_common.h
-    applets/software_keyboard.cpp
-    applets/software_keyboard.h
     emu_window/emu_window.cpp
     emu_window/emu_window.h
-    id_cache.cpp
-    id_cache.h
     native.cpp
     native.h
     native_config.cpp
diff --git a/src/android/app/src/main/jni/android_settings.h b/src/android/app/src/main/jni/android_settings.h
index cf93304da..4a3bc8e53 100755
--- a/src/android/app/src/main/jni/android_settings.h
+++ b/src/android/app/src/main/jni/android_settings.h
@@ -60,6 +60,8 @@ struct Values {
                                                      Settings::Category::Overlay};
     Settings::Setting<bool> show_performance_overlay{linkage, true, "show_performance_overlay",
                                                      Settings::Category::Overlay};
+    Settings::Setting<bool> show_thermal_overlay{linkage, false, "show_thermal_overlay",
+                                                 Settings::Category::Overlay};
     Settings::Setting<bool> show_input_overlay{linkage, true, "show_input_overlay",
                                                Settings::Category::Overlay};
     Settings::Setting<bool> touchscreen{linkage, true, "touchscreen", Settings::Category::Overlay};
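
The new show_thermal_overlay toggle joins the other overlay settings and defaults to off. On the Kotlin side it would be read by key through NativeConfig; the sketch below assumes a getBoolean(key, needGlobal) binding shaped like the getString binding visible in native_config.cpp further down, which this diff does not itself show:

    // Assumption: NativeConfig exposes getBoolean(key: String, needGlobal: Boolean),
    // mirroring the getString/setString JNI bindings in native_config.cpp.
    val showThermalOverlay: Boolean =
        NativeConfig.getBoolean("show_thermal_overlay", false)
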
diff --git a/src/android/app/src/main/jni/emu_window/emu_window.cpp b/src/android/app/src/main/jni/emu_window/emu_window.cpp
index c4f631924..c927cddda 100755
--- a/src/android/app/src/main/jni/emu_window/emu_window.cpp
+++ b/src/android/app/src/main/jni/emu_window/emu_window.cpp
@@ -3,6 +3,7 @@
 
 #include <android/native_window_jni.h>
 
+#include "common/android/id_cache.h"
 #include "common/logging/log.h"
 #include "input_common/drivers/touch_screen.h"
 #include "input_common/drivers/virtual_amiibo.h"
@@ -60,7 +61,8 @@ void EmuWindow_Android::OnRemoveNfcTag() {
 
 void EmuWindow_Android::OnFrameDisplayed() {
     if (!m_first_frame) {
-        EmulationSession::GetInstance().OnEmulationStarted();
+        Common::Android::RunJNIOnFiber<void>(
+            [&](JNIEnv* env) { EmulationSession::GetInstance().OnEmulationStarted(); });
         m_first_frame = true;
     }
 }
diff --git a/src/android/app/src/main/jni/game_metadata.cpp b/src/android/app/src/main/jni/game_metadata.cpp
index 8f0da1413..c33763b47 100755
--- a/src/android/app/src/main/jni/game_metadata.cpp
+++ b/src/android/app/src/main/jni/game_metadata.cpp
@@ -1,13 +1,12 @@
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include "common/android/android_common.h"
 #include "core/core.h"
 #include "core/file_sys/fs_filesystem.h"
 #include "core/file_sys/patch_manager.h"
 #include "core/loader/loader.h"
 #include "core/loader/nro.h"
-#include "jni.h"
-#include "jni/android_common/android_common.h"
 #include "native.h"
 
 struct RomMetadata {
@@ -79,7 +78,7 @@ extern "C" {
 jboolean Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIsValid(JNIEnv* env, jobject obj,
                                                                jstring jpath) {
     const auto file = EmulationSession::GetInstance().System().GetFilesystem()->OpenFile(
-        GetJString(env, jpath), FileSys::OpenMode::Read);
+        Common::Android::GetJString(env, jpath), FileSys::OpenMode::Read);
     if (!file) {
         return false;
     }
@@ -104,27 +103,31 @@ jboolean Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIsValid(JNIEnv* env, jobj
 
 jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getTitle(JNIEnv* env, jobject obj,
                                                             jstring jpath) {
-    return ToJString(env, GetRomMetadata(GetJString(env, jpath)).title);
+    return Common::Android::ToJString(
+        env, GetRomMetadata(Common::Android::GetJString(env, jpath)).title);
 }
 
 jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getProgramId(JNIEnv* env, jobject obj,
                                                                 jstring jpath) {
-    return ToJString(env, std::to_string(GetRomMetadata(GetJString(env, jpath)).programId));
+    return Common::Android::ToJString(
+        env, std::to_string(GetRomMetadata(Common::Android::GetJString(env, jpath)).programId));
 }
 
 jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getDeveloper(JNIEnv* env, jobject obj,
                                                                 jstring jpath) {
-    return ToJString(env, GetRomMetadata(GetJString(env, jpath)).developer);
+    return Common::Android::ToJString(
+        env, GetRomMetadata(Common::Android::GetJString(env, jpath)).developer);
 }
 
 jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getVersion(JNIEnv* env, jobject obj,
                                                               jstring jpath, jboolean jreload) {
-    return ToJString(env, GetRomMetadata(GetJString(env, jpath), jreload).version);
+    return Common::Android::ToJString(
+        env, GetRomMetadata(Common::Android::GetJString(env, jpath), jreload).version);
 }
 
 jbyteArray Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIcon(JNIEnv* env, jobject obj,
                                                               jstring jpath) {
-    auto icon_data = GetRomMetadata(GetJString(env, jpath)).icon;
+    auto icon_data = GetRomMetadata(Common::Android::GetJString(env, jpath)).icon;
     jbyteArray icon = env->NewByteArray(static_cast<jsize>(icon_data.size()));
     env->SetByteArrayRegion(icon, 0, env->GetArrayLength(icon),
                             reinterpret_cast<jbyte*>(icon_data.data()));
@@ -133,7 +136,8 @@ jbyteArray Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIcon(JNIEnv* env, jobje
 
 jboolean Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIsHomebrew(JNIEnv* env, jobject obj,
                                                                   jstring jpath) {
-    return static_cast<jboolean>(GetRomMetadata(GetJString(env, jpath)).isHomebrew);
+    return static_cast<jboolean>(
+        GetRomMetadata(Common::Android::GetJString(env, jpath)).isHomebrew);
 }
 
 void Java_org_yuzu_yuzu_1emu_utils_GameMetadata_resetMetadata(JNIEnv* env, jobject obj) {
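
The Java_org_yuzu_yuzu_1emu_utils_GameMetadata_* symbols fix the shape of the Kotlin declaration they implement. A reconstruction of that counterpart from the signatures above (the actual GameMetadata.kt is not part of this diff, so treat this as an approximation):

    package org.yuzu.yuzu_emu.utils

    // Reconstructed from the JNI signatures in game_metadata.cpp: jstring maps
    // to String, jboolean to Boolean, jbyteArray to ByteArray.
    object GameMetadata {
        external fun getIsValid(path: String): Boolean
        external fun getTitle(path: String): String
        external fun getProgramId(path: String): String
        external fun getDeveloper(path: String): String
        external fun getVersion(path: String, reload: Boolean): String
        external fun getIcon(path: String): ByteArray
        external fun getIsHomebrew(path: String): Boolean
        external fun resetMetadata()
    }
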
diff --git a/src/android/app/src/main/jni/native.cpp b/src/android/app/src/main/jni/native.cpp
index 654510129..4acc60956 100755
--- a/src/android/app/src/main/jni/native.cpp
+++ b/src/android/app/src/main/jni/native.cpp
@@ -20,6 +20,8 @@
 #include <frontend_common/content_manager.h>
 #include <jni.h>
 
+#include "common/android/android_common.h"
+#include "common/android/id_cache.h"
 #include "common/detached_tasks.h"
 #include "common/dynamic_library.h"
 #include "common/fs/path_util.h"
@@ -57,8 +59,6 @@
 #include "hid_core/frontend/emulated_controller.h"
 #include "hid_core/hid_core.h"
 #include "hid_core/hid_types.h"
-#include "jni/android_common/android_common.h"
-#include "jni/id_cache.h"
 #include "jni/native.h"
 #include "video_core/renderer_base.h"
 #include "video_core/renderer_vulkan/renderer_vulkan.h"
@@ -228,7 +228,7 @@ Core::SystemResultStatus EmulationSession::InitializeEmulation(const std::string
         std::make_unique<EmuWindow_Android>(&m_input_subsystem, m_native_window, m_vulkan_library);
 
     // Initialize system.
-    jauto android_keyboard = std::make_unique<SoftwareKeyboard::AndroidKeyboard>();
+    jauto android_keyboard = std::make_unique<Common::Android::SoftwareKeyboard::AndroidKeyboard>();
     m_software_keyboard = android_keyboard.get();
     m_system.SetShuttingDown(false);
     m_system.ApplySettings();
@@ -411,37 +411,39 @@ void EmulationSession::OnGamepadDisconnectEvent([[maybe_unused]] int index) {
     controller->Disconnect();
 }
 
-SoftwareKeyboard::AndroidKeyboard* EmulationSession::SoftwareKeyboard() {
+Common::Android::SoftwareKeyboard::AndroidKeyboard* EmulationSession::SoftwareKeyboard() {
     return m_software_keyboard;
 }
 
 void EmulationSession::LoadDiskCacheProgress(VideoCore::LoadCallbackStage stage, int progress,
                                              int max) {
-    JNIEnv* env = IDCache::GetEnvForThread();
-    env->CallStaticVoidMethod(IDCache::GetDiskCacheProgressClass(),
-                              IDCache::GetDiskCacheLoadProgress(), static_cast<jint>(stage),
+    JNIEnv* env = Common::Android::GetEnvForThread();
+    env->CallStaticVoidMethod(Common::Android::GetDiskCacheProgressClass(),
+                              Common::Android::GetDiskCacheLoadProgress(), static_cast<jint>(stage),
                               static_cast<jint>(progress), static_cast<jint>(max));
 }
 
 void EmulationSession::OnEmulationStarted() {
-    JNIEnv* env = IDCache::GetEnvForThread();
-    env->CallStaticVoidMethod(IDCache::GetNativeLibraryClass(), IDCache::GetOnEmulationStarted());
+    JNIEnv* env = Common::Android::GetEnvForThread();
+    env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(),
+                              Common::Android::GetOnEmulationStarted());
 }
 
 void EmulationSession::OnEmulationStopped(Core::SystemResultStatus result) {
-    JNIEnv* env = IDCache::GetEnvForThread();
-    env->CallStaticVoidMethod(IDCache::GetNativeLibraryClass(), IDCache::GetOnEmulationStopped(),
-                              static_cast<jint>(result));
+    JNIEnv* env = Common::Android::GetEnvForThread();
+    env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(),
+                              Common::Android::GetOnEmulationStopped(), static_cast<jint>(result));
 }
 
 void EmulationSession::ChangeProgram(std::size_t program_index) {
-    JNIEnv* env = IDCache::GetEnvForThread();
-    env->CallStaticVoidMethod(IDCache::GetNativeLibraryClass(), IDCache::GetOnProgramChanged(),
+    JNIEnv* env = Common::Android::GetEnvForThread();
+    env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(),
+                              Common::Android::GetOnProgramChanged(),
                               static_cast<jint>(program_index));
 }
 
 u64 EmulationSession::GetProgramId(JNIEnv* env, jstring jprogramId) {
-    auto program_id_string = GetJString(env, jprogramId);
+    auto program_id_string = Common::Android::GetJString(env, jprogramId);
     try {
         return std::stoull(program_id_string);
     } catch (...) {
@@ -491,7 +493,7 @@ void Java_org_yuzu_yuzu_1emu_NativeLibrary_surfaceDestroyed(JNIEnv* env, jobject
 
 void Java_org_yuzu_yuzu_1emu_NativeLibrary_setAppDirectory(JNIEnv* env, jobject instance,
                                                            [[maybe_unused]] jstring j_directory) {
-    Common::FS::SetAppDirectory(GetJString(env, j_directory));
+    Common::FS::SetAppDirectory(Common::Android::GetJString(env, j_directory));
 }
 
 int Java_org_yuzu_yuzu_1emu_NativeLibrary_installFileToNand(JNIEnv* env, jobject instance,
@@ -501,21 +503,22 @@ int Java_org_yuzu_yuzu_1emu_NativeLibrary_installFileToNand(JNIEnv* env, jobject
         jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;");
     const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) {
         auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod,
-                                                   ToJDouble(env, max), ToJDouble(env, progress));
-        return GetJBoolean(env, jwasCancelled);
+                                                   Common::Android::ToJDouble(env, max),
+                                                   Common::Android::ToJDouble(env, progress));
+        return Common::Android::GetJBoolean(env, jwasCancelled);
     };
 
     return static_cast<int>(
         ContentManager::InstallNSP(EmulationSession::GetInstance().System(),
                                    *EmulationSession::GetInstance().System().GetFilesystem(),
-                                   GetJString(env, j_file), callback));
+                                   Common::Android::GetJString(env, j_file), callback));
 }
 
 jboolean Java_org_yuzu_yuzu_1emu_NativeLibrary_doesUpdateMatchProgram(JNIEnv* env, jobject jobj,
                                                                       jstring jprogramId,
                                                                       jstring jupdatePath) {
     u64 program_id = EmulationSession::GetProgramId(env, jprogramId);
-    std::string updatePath = GetJString(env, jupdatePath);
+    std::string updatePath = Common::Android::GetJString(env, jupdatePath);
     std::shared_ptr<FileSys::NSP> nsp = std::make_shared<FileSys::NSP>(
         EmulationSession::GetInstance().System().GetFilesystem()->OpenFile(
             updatePath, FileSys::OpenMode::Read));
@@ -538,8 +541,10 @@ void JNICALL Java_org_yuzu_yuzu_1emu_NativeLibrary_initializeGpuDriver(JNIEnv* e
                                                                 jstring custom_driver_name,
                                                                 jstring file_redirect_dir) {
     EmulationSession::GetInstance().InitializeGpuDriver(
-        GetJString(env, hook_lib_dir), GetJString(env, custom_driver_dir),
-        GetJString(env, custom_driver_name), GetJString(env, file_redirect_dir));
+        Common::Android::GetJString(env, hook_lib_dir),
+        Common::Android::GetJString(env, custom_driver_dir),
+        Common::Android::GetJString(env, custom_driver_name),
+        Common::Android::GetJString(env, file_redirect_dir));
 }
 
 [[maybe_unused]] static bool CheckKgslPresent() {
@@ -566,7 +571,7 @@ jobjectArray Java_org_yuzu_yuzu_1emu_utils_GpuDriverHelper_getSystemDriverInfo(
     JNIEnv* env, jobject j_obj, jobject j_surf, jstring j_hook_lib_dir) {
     const char* file_redirect_dir_{};
     int featureFlags{};
-    std::string hook_lib_dir = GetJString(env, j_hook_lib_dir);
+    std::string hook_lib_dir = Common::Android::GetJString(env, j_hook_lib_dir);
     auto handle = adrenotools_open_libvulkan(RTLD_NOW, featureFlags, nullptr, hook_lib_dir.c_str(),
                                              nullptr, nullptr, file_redirect_dir_, nullptr);
     auto driver_library = std::make_shared<Common::DynamicLibrary>(handle);
@@ -587,9 +592,10 @@ jobjectArray Java_org_yuzu_yuzu_1emu_utils_GpuDriverHelper_getSystemDriverInfo(
         fmt::format("{}.{}.{}", VK_API_VERSION_MAJOR(driver_version),
                     VK_API_VERSION_MINOR(driver_version), VK_API_VERSION_PATCH(driver_version));
 
-    jobjectArray j_driver_info =
-        env->NewObjectArray(2, IDCache::GetStringClass(), ToJString(env, version_string));
-    env->SetObjectArrayElement(j_driver_info, 1, ToJString(env, device.GetDriverName()));
+    jobjectArray j_driver_info = env->NewObjectArray(
+        2, Common::Android::GetStringClass(), Common::Android::ToJString(env, version_string));
+    env->SetObjectArrayElement(j_driver_info, 1,
+                               Common::Android::ToJString(env, device.GetDriverName()));
     return j_driver_info;
 }
 
@@ -742,15 +748,15 @@ jdoubleArray Java_org_yuzu_yuzu_1emu_NativeLibrary_getPerfStats(JNIEnv* env, jcl
 
 jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getCpuBackend(JNIEnv* env, jclass clazz) {
     if (Settings::IsNceEnabled()) {
-        return ToJString(env, "NCE");
+        return Common::Android::ToJString(env, "NCE");
     }
 
-    return ToJString(env, "JIT");
+    return Common::Android::ToJString(env, "JIT");
 }
 
 jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getGpuDriver(JNIEnv* env, jobject jobj) {
-    return ToJString(env,
-                     EmulationSession::GetInstance().System().GPU().Renderer().GetDeviceVendor());
+    return Common::Android::ToJString(
+        env, EmulationSession::GetInstance().System().GPU().Renderer().GetDeviceVendor());
 }
 
 void Java_org_yuzu_yuzu_1emu_NativeLibrary_applySettings(JNIEnv* env, jobject jobj) {
@@ -764,13 +770,14 @@ void Java_org_yuzu_yuzu_1emu_NativeLibrary_logSettings(JNIEnv* env, jobject jobj
 void Java_org_yuzu_yuzu_1emu_NativeLibrary_run(JNIEnv* env, jobject jobj, jstring j_path,
                                                jint j_program_index,
                                                jboolean j_frontend_initiated) {
-    const std::string path = GetJString(env, j_path);
+    const std::string path = Common::Android::GetJString(env, j_path);
 
     const Core::SystemResultStatus result{
         RunEmulation(path, j_program_index, j_frontend_initiated)};
     if (result != Core::SystemResultStatus::Success) {
-        env->CallStaticVoidMethod(IDCache::GetNativeLibraryClass(),
-                                  IDCache::GetExitEmulationActivity(), static_cast<int>(result));
+        env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(),
+                                  Common::Android::GetExitEmulationActivity(),
+                                  static_cast<int>(result));
     }
 }
 
@@ -781,7 +788,7 @@ void Java_org_yuzu_yuzu_1emu_NativeLibrary_logDeviceInfo(JNIEnv* env, jclass cla
 
 void Java_org_yuzu_yuzu_1emu_NativeLibrary_submitInlineKeyboardText(JNIEnv* env, jclass clazz,
                                                                     jstring j_text) {
-    const std::u16string input = Common::UTF8ToUTF16(GetJString(env, j_text));
+    const std::u16string input = Common::UTF8ToUTF16(Common::Android::GetJString(env, j_text));
     EmulationSession::GetInstance().SoftwareKeyboard()->SubmitInlineKeyboardText(input);
 }
 
@@ -815,16 +822,16 @@ jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getAppletLaunchPath(JNIEnv* env, j
     auto bis_system =
         EmulationSession::GetInstance().System().GetFileSystemController().GetSystemNANDContents();
     if (!bis_system) {
-        return ToJString(env, "");
+        return Common::Android::ToJString(env, "");
     }
 
     auto applet_nca =
         bis_system->GetEntry(static_cast<u64>(jid), FileSys::ContentRecordType::Program);
     if (!applet_nca) {
-        return ToJString(env, "");
+        return Common::Android::ToJString(env, "");
     }
 
-    return ToJString(env, applet_nca->GetFullPath());
+    return Common::Android::ToJString(env, applet_nca->GetFullPath());
 }
 
 void Java_org_yuzu_yuzu_1emu_NativeLibrary_setCurrentAppletId(JNIEnv* env, jclass clazz,
@@ -857,7 +864,7 @@ jboolean Java_org_yuzu_yuzu_1emu_NativeLibrary_isFirmwareAvailable(JNIEnv* env,
 jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_getPatchesForFile(JNIEnv* env, jobject jobj,
                                                                      jstring jpath,
                                                                      jstring jprogramId) {
-    const auto path = GetJString(env, jpath);
+    const auto path = Common::Android::GetJString(env, jpath);
     const auto vFile =
         Core::GetGameFileFromPath(EmulationSession::GetInstance().System().GetFilesystem(), path);
     if (vFile == nullptr) {
@@ -875,14 +882,15 @@ jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_getPatchesForFile(JNIEnv* env
 
     auto patches = pm.GetPatches(update_raw);
     jobjectArray jpatchArray =
-        env->NewObjectArray(patches.size(), IDCache::GetPatchClass(), nullptr);
+        env->NewObjectArray(patches.size(), Common::Android::GetPatchClass(), nullptr);
     int i = 0;
     for (const auto& patch : patches) {
         jobject jpatch = env->NewObject(
-            IDCache::GetPatchClass(), IDCache::GetPatchConstructor(), patch.enabled,
-            ToJString(env, patch.name), ToJString(env, patch.version),
-            static_cast<jint>(patch.type), ToJString(env, std::to_string(patch.program_id)),
-            ToJString(env, std::to_string(patch.title_id)));
+            Common::Android::GetPatchClass(), Common::Android::GetPatchConstructor(), patch.enabled,
+            Common::Android::ToJString(env, patch.name),
+            Common::Android::ToJString(env, patch.version), static_cast<jint>(patch.type),
+            Common::Android::ToJString(env, std::to_string(patch.program_id)),
+            Common::Android::ToJString(env, std::to_string(patch.title_id)));
         env->SetObjectArrayElement(jpatchArray, i, jpatch);
         ++i;
     }
@@ -906,7 +914,7 @@ void Java_org_yuzu_yuzu_1emu_NativeLibrary_removeMod(JNIEnv* env, jobject jobj,
                                                      jstring jname) {
     auto program_id = EmulationSession::GetProgramId(env, jprogramId);
     ContentManager::RemoveMod(EmulationSession::GetInstance().System().GetFileSystemController(),
-                              program_id, GetJString(env, jname));
+                              program_id, Common::Android::GetJString(env, jname));
 }
 
 jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_verifyInstalledContents(JNIEnv* env,
@@ -917,17 +925,18 @@ jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_verifyInstalledContents(JNIEn
         jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;");
     const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) {
         auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod,
-                                                   ToJDouble(env, max), ToJDouble(env, progress));
-        return GetJBoolean(env, jwasCancelled);
+                                                   Common::Android::ToJDouble(env, max),
+                                                   Common::Android::ToJDouble(env, progress));
+        return Common::Android::GetJBoolean(env, jwasCancelled);
     };
 
     auto& session = EmulationSession::GetInstance();
     std::vector<std::string> result = ContentManager::VerifyInstalledContents(
         session.System(), *session.GetContentProvider(), callback);
-    jobjectArray jresult =
-        env->NewObjectArray(result.size(), IDCache::GetStringClass(), ToJString(env, ""));
+    jobjectArray jresult = env->NewObjectArray(result.size(), Common::Android::GetStringClass(),
+                                               Common::Android::ToJString(env, ""));
     for (size_t i = 0; i < result.size(); ++i) {
-        env->SetObjectArrayElement(jresult, i, ToJString(env, result[i]));
+        env->SetObjectArrayElement(jresult, i, Common::Android::ToJString(env, result[i]));
     }
     return jresult;
 }
@@ -939,19 +948,20 @@ jint Java_org_yuzu_yuzu_1emu_NativeLibrary_verifyGameContents(JNIEnv* env, jobje
         jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;");
     const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) {
         auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod,
-                                                   ToJDouble(env, max), ToJDouble(env, progress));
-        return GetJBoolean(env, jwasCancelled);
+                                                   Common::Android::ToJDouble(env, max),
+                                                   Common::Android::ToJDouble(env, progress));
+        return Common::Android::GetJBoolean(env, jwasCancelled);
     };
     auto& session = EmulationSession::GetInstance();
-    return static_cast<jint>(
-        ContentManager::VerifyGameContents(session.System(), GetJString(env, jpath), callback));
+    return static_cast<jint>(ContentManager::VerifyGameContents(
+        session.System(), Common::Android::GetJString(env, jpath), callback));
 }
 
 jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getSavePath(JNIEnv* env, jobject jobj,
                                                           jstring jprogramId) {
     auto program_id = EmulationSession::GetProgramId(env, jprogramId);
     if (program_id == 0) {
-        return ToJString(env, "");
+        return Common::Android::ToJString(env, "");
     }
 
     auto& system = EmulationSession::GetInstance().System();
@@ -968,7 +978,7 @@ jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getSavePath(JNIEnv* env, jobject j
     const auto user_save_data_path = FileSys::SaveDataFactory::GetFullPath(
         {}, vfsNandDir, FileSys::SaveDataSpaceId::NandUser, FileSys::SaveDataType::SaveData,
         program_id, user_id->AsU128(), 0);
-    return ToJString(env, user_save_data_path);
+    return Common::Android::ToJString(env, user_save_data_path);
 }
 
 jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getDefaultProfileSaveDataRoot(JNIEnv* env,
@@ -981,12 +991,13 @@ jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getDefaultProfileSaveDataRoot(JNIE
 
     const auto user_save_data_root =
         FileSys::SaveDataFactory::GetUserGameSaveDataRoot(user_id->AsU128(), jfuture);
-    return ToJString(env, user_save_data_root);
+    return Common::Android::ToJString(env, user_save_data_root);
 }
 
 void Java_org_yuzu_yuzu_1emu_NativeLibrary_addFileToFilesystemProvider(JNIEnv* env, jobject jobj,
                                                                        jstring jpath) {
-    EmulationSession::GetInstance().ConfigureFilesystemProvider(GetJString(env, jpath));
+    EmulationSession::GetInstance().ConfigureFilesystemProvider(
+        Common::Android::GetJString(env, jpath));
 }
 
 void Java_org_yuzu_yuzu_1emu_NativeLibrary_clearFilesystemProvider(JNIEnv* env, jobject jobj) {
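
installFileToNand, verifyInstalledContents and verifyGameContents all drive a Kotlin lambda through its erased invoke(Object, Object) method, boxing both size_t values via ToJDouble and reading a boxed Boolean back with GetJBoolean. That pins the Kotlin-side callback type to (Double, Double) -> Boolean, where true requests cancellation. A sketch of a conforming callback (the surrounding NativeLibrary declaration is inferred from the JNI code, not shown in this diff):

    // Inferred callback shape: native boxes (max, progress) as Doubles and
    // interprets the returned Boolean as a cancellation request.
    val progressCallback: (Double, Double) -> Boolean = { max, progress ->
        println("progress: $progress / $max")
        false // never cancel in this sketch
    }
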
diff --git a/src/android/app/src/main/jni/native.h b/src/android/app/src/main/jni/native.h
index e49d4e015..47936e305 100755
--- a/src/android/app/src/main/jni/native.h
+++ b/src/android/app/src/main/jni/native.h
@@ -2,13 +2,13 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <android/native_window_jni.h>
+#include "common/android/applets/software_keyboard.h"
 #include "common/detached_tasks.h"
 #include "core/core.h"
 #include "core/file_sys/registered_cache.h"
 #include "core/hle/service/acc/profile_manager.h"
 #include "core/perf_stats.h"
 #include "frontend_common/content_manager.h"
-#include "jni/applets/software_keyboard.h"
 #include "jni/emu_window/emu_window.h"
 #include "video_core/rasterizer_interface.h"
 
@@ -54,7 +54,7 @@ public:
     void SetDeviceType([[maybe_unused]] int index, int type);
     void OnGamepadConnectEvent([[maybe_unused]] int index);
     void OnGamepadDisconnectEvent([[maybe_unused]] int index);
-    SoftwareKeyboard::AndroidKeyboard* SoftwareKeyboard();
+    Common::Android::SoftwareKeyboard::AndroidKeyboard* SoftwareKeyboard();
 
     static void OnEmulationStarted();
 
@@ -79,7 +79,7 @@ private:
     Core::SystemResultStatus m_load_result{Core::SystemResultStatus::ErrorNotInitialized};
     std::atomic<bool> m_is_running = false;
     std::atomic<bool> m_is_paused = false;
-    SoftwareKeyboard::AndroidKeyboard* m_software_keyboard{};
+    Common::Android::SoftwareKeyboard::AndroidKeyboard* m_software_keyboard{};
     std::unique_ptr<FileSys::ManualContentProvider> m_manual_provider;
     int m_applet_id{1};
 
diff --git a/src/android/app/src/main/jni/native_config.cpp b/src/android/app/src/main/jni/native_config.cpp
index c6c3343dc..8ae10fbc7 100755
--- a/src/android/app/src/main/jni/native_config.cpp
+++ b/src/android/app/src/main/jni/native_config.cpp
@@ -8,11 +8,11 @@
 
 #include "android_config.h"
 #include "android_settings.h"
+#include "common/android/android_common.h"
+#include "common/android/id_cache.h"
 #include "common/logging/log.h"
 #include "common/settings.h"
 #include "frontend_common/config.h"
-#include "jni/android_common/android_common.h"
-#include "jni/id_cache.h"
 #include "native.h"
 
 std::unique_ptr<AndroidConfig> global_config;
@@ -20,7 +20,7 @@ std::unique_ptr<AndroidConfig> per_game_config;
 
 template <typename T>
 Settings::Setting<T>* getSetting(JNIEnv* env, jstring jkey) {
-    auto key = GetJString(env, jkey);
+    auto key = Common::Android::GetJString(env, jkey);
     auto basic_setting = Settings::values.linkage.by_key[key];
     if (basic_setting != 0) {
         return static_cast<Settings::Setting<T>*>(basic_setting);
@@ -55,7 +55,7 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_initializePerGameConfig(JNIEnv*
                                                                        jstring jprogramId,
                                                                        jstring jfileName) {
     auto program_id = EmulationSession::GetProgramId(env, jprogramId);
-    auto file_name = GetJString(env, jfileName);
+    auto file_name = Common::Android::GetJString(env, jfileName);
     const auto config_file_name = program_id == 0 ? file_name : fmt::format("{:016X}", program_id);
     per_game_config =
         std::make_unique<AndroidConfig>(config_file_name, Config::ConfigType::PerGameConfig);
@@ -186,9 +186,9 @@ jstring Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getString(JNIEnv* env, jobjec
                                                            jboolean needGlobal) {
     auto setting = getSetting<std::string>(env, jkey);
     if (setting == nullptr) {
-        return ToJString(env, "");
+        return Common::Android::ToJString(env, "");
     }
-    return ToJString(env, setting->GetValue(static_cast<bool>(needGlobal)));
+    return Common::Android::ToJString(env, setting->GetValue(static_cast<bool>(needGlobal)));
 }
 
 void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setString(JNIEnv* env, jobject obj, jstring jkey,
@@ -198,7 +198,7 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setString(JNIEnv* env, jobject o
         return;
     }
 
-    setting->SetValue(GetJString(env, value));
+    setting->SetValue(Common::Android::GetJString(env, value));
 }
 
 jboolean Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getIsRuntimeModifiable(JNIEnv* env, jobject obj,
@@ -214,13 +214,13 @@ jstring Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getPairedSettingKey(JNIEnv* e
                                                                       jstring jkey) {
     auto setting = getSetting<std::string>(env, jkey);
     if (setting == nullptr) {
-        return ToJString(env, "");
+        return Common::Android::ToJString(env, "");
     }
     if (setting->PairedSetting() == nullptr) {
-        return ToJString(env, "");
+        return Common::Android::ToJString(env, "");
     }
 
-    return ToJString(env, setting->PairedSetting()->GetLabel());
+    return Common::Android::ToJString(env, setting->PairedSetting()->GetLabel());
 }
 
 jboolean Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getIsSwitchable(JNIEnv* env, jobject obj,
@@ -262,21 +262,21 @@ jstring Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getDefaultToString(JNIEnv* en
                                                                    jstring jkey) {
     auto setting = getSetting<std::string>(env, jkey);
     if (setting != nullptr) {
-        return ToJString(env, setting->DefaultToString());
+        return Common::Android::ToJString(env, setting->DefaultToString());
     }
-    return ToJString(env, "");
+    return Common::Android::ToJString(env, "");
 }
 
 jobjectArray Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getGameDirs(JNIEnv* env, jobject obj) {
-    jclass gameDirClass = IDCache::GetGameDirClass();
-    jmethodID gameDirConstructor = IDCache::GetGameDirConstructor();
+    jclass gameDirClass = Common::Android::GetGameDirClass();
+    jmethodID gameDirConstructor = Common::Android::GetGameDirConstructor();
     jobjectArray jgameDirArray =
         env->NewObjectArray(AndroidSettings::values.game_dirs.size(), gameDirClass, nullptr);
     for (size_t i = 0; i < AndroidSettings::values.game_dirs.size(); ++i) {
-        jobject jgameDir =
-            env->NewObject(gameDirClass, gameDirConstructor,
-                           ToJString(env, AndroidSettings::values.game_dirs[i].path),
+        jobject jgameDir = env->NewObject(
+            gameDirClass, gameDirConstructor,
+            Common::Android::ToJString(env, AndroidSettings::values.game_dirs[i].path),
             static_cast<jboolean>(AndroidSettings::values.game_dirs[i].deep_scan));
         env->SetObjectArrayElement(jgameDirArray, i, jgameDir);
     }
     return jgameDirArray;
@@ -292,14 +292,14 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setGameDirs(JNIEnv* env, jobject
     }
 
     jobject dir = env->GetObjectArrayElement(gameDirs, 0);
-    jclass gameDirClass = IDCache::GetGameDirClass();
+    jclass gameDirClass = Common::Android::GetGameDirClass();
     jfieldID uriStringField = env->GetFieldID(gameDirClass, "uriString", "Ljava/lang/String;");
     jfieldID deepScanBooleanField = env->GetFieldID(gameDirClass, "deepScan", "Z");
     for (int i = 0; i < size; ++i) {
         dir = env->GetObjectArrayElement(gameDirs, i);
         jstring juriString = static_cast<jstring>(env->GetObjectField(dir, uriStringField));
         jboolean jdeepScanBoolean = env->GetBooleanField(dir, deepScanBooleanField);
-        std::string uriString = GetJString(env, juriString);
+        std::string uriString = Common::Android::GetJString(env, juriString);
         AndroidSettings::values.game_dirs.push_back(
             AndroidSettings::GameDir{uriString, static_cast<bool>(jdeepScanBoolean)});
     }
@@ -307,13 +307,13 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setGameDirs(JNIEnv* env, jobject
 
 void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_addGameDir(JNIEnv* env, jobject obj,
                                                            jobject gameDir) {
-    jclass gameDirClass = IDCache::GetGameDirClass();
+    jclass gameDirClass = Common::Android::GetGameDirClass();
     jfieldID uriStringField = env->GetFieldID(gameDirClass, "uriString", "Ljava/lang/String;");
     jfieldID deepScanBooleanField = env->GetFieldID(gameDirClass, "deepScan", "Z");
 
     jstring juriString = static_cast<jstring>(env->GetObjectField(gameDir, uriStringField));
     jboolean jdeepScanBoolean = env->GetBooleanField(gameDir, deepScanBooleanField);
-    std::string uriString = GetJString(env, juriString);
+    std::string uriString = Common::Android::GetJString(env, juriString);
     AndroidSettings::values.game_dirs.push_back(
         AndroidSettings::GameDir{uriString, static_cast<bool>(jdeepScanBoolean)});
 }
@@ -323,9 +323,11 @@ jobjectArray Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getDisabledAddons(JNIEnv
     auto program_id = EmulationSession::GetProgramId(env, jprogramId);
     auto& disabledAddons = Settings::values.disabled_addons[program_id];
     jobjectArray jdisabledAddonsArray =
-        env->NewObjectArray(disabledAddons.size(), IDCache::GetStringClass(), ToJString(env, ""));
+        env->NewObjectArray(disabledAddons.size(), Common::Android::GetStringClass(),
+                            Common::Android::ToJString(env, ""));
     for (size_t i = 0; i < disabledAddons.size(); ++i) {
-        env->SetObjectArrayElement(jdisabledAddonsArray, i, ToJString(env, disabledAddons[i]));
+        env->SetObjectArrayElement(jdisabledAddonsArray, i,
+                                   Common::Android::ToJString(env, disabledAddons[i]));
     }
     return jdisabledAddonsArray;
 }
@@ -339,7 +341,7 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setDisabledAddons(JNIEnv* env, j
     const int size = env->GetArrayLength(jdisabledAddons);
     for (int i = 0; i < size; ++i) {
         auto jaddon = static_cast<jstring>(env->GetObjectArrayElement(jdisabledAddons, i));
-        disabled_addons.push_back(GetJString(env, jaddon));
+        disabled_addons.push_back(Common::Android::GetJString(env, jaddon));
     }
     Settings::values.disabled_addons[program_id] = disabled_addons;
 }
@@ -348,26 +350,27 @@ jobjectArray Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getOverlayControlData(JN
                                                                         jobject obj) {
     jobjectArray joverlayControlDataArray =
         env->NewObjectArray(AndroidSettings::values.overlay_control_data.size(),
-                            IDCache::GetOverlayControlDataClass(), nullptr);
+                            Common::Android::GetOverlayControlDataClass(), nullptr);
     for (size_t i = 0; i < AndroidSettings::values.overlay_control_data.size(); ++i) {
         const auto& control_data = AndroidSettings::values.overlay_control_data[i];
         jobject jlandscapePosition =
-            env->NewObject(IDCache::GetPairClass(), IDCache::GetPairConstructor(),
-                           ToJDouble(env, control_data.landscape_position.first),
-                           ToJDouble(env, control_data.landscape_position.second));
+            env->NewObject(Common::Android::GetPairClass(), Common::Android::GetPairConstructor(),
+                           Common::Android::ToJDouble(env, control_data.landscape_position.first),
+                           Common::Android::ToJDouble(env, control_data.landscape_position.second));
         jobject jportraitPosition =
-            env->NewObject(IDCache::GetPairClass(), IDCache::GetPairConstructor(),
-                           ToJDouble(env, control_data.portrait_position.first),
-                           ToJDouble(env, control_data.portrait_position.second));
+            env->NewObject(Common::Android::GetPairClass(), Common::Android::GetPairConstructor(),
+                           Common::Android::ToJDouble(env, control_data.portrait_position.first),
+                           Common::Android::ToJDouble(env, control_data.portrait_position.second));
         jobject jfoldablePosition =
-            env->NewObject(IDCache::GetPairClass(), IDCache::GetPairConstructor(),
-                           ToJDouble(env, control_data.foldable_position.first),
-                           ToJDouble(env, control_data.foldable_position.second));
+            env->NewObject(Common::Android::GetPairClass(), Common::Android::GetPairConstructor(),
+                           Common::Android::ToJDouble(env, control_data.foldable_position.first),
+                           Common::Android::ToJDouble(env, control_data.foldable_position.second));
 
-        jobject jcontrolData = env->NewObject(
-            IDCache::GetOverlayControlDataClass(), IDCache::GetOverlayControlDataConstructor(),
-            ToJString(env, control_data.id), control_data.enabled, jlandscapePosition,
-            jportraitPosition, jfoldablePosition);
+        jobject jcontrolData =
+            env->NewObject(Common::Android::GetOverlayControlDataClass(),
+                           Common::Android::GetOverlayControlDataConstructor(),
+                           Common::Android::ToJString(env, control_data.id), control_data.enabled,
+                           jlandscapePosition, jportraitPosition, jfoldablePosition);
         env->SetObjectArrayElement(joverlayControlDataArray, i, jcontrolData);
     }
     return joverlayControlDataArray;
@@ -384,33 +387,41 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setOverlayControlData(
384 387
385 for (int i = 0; i < size; ++i) { 388 for (int i = 0; i < size; ++i) {
386 jobject joverlayControlData = env->GetObjectArrayElement(joverlayControlDataArray, i); 389 jobject joverlayControlData = env->GetObjectArrayElement(joverlayControlDataArray, i);
387 jstring jidString = static_cast<jstring>( 390 jstring jidString = static_cast<jstring>(env->GetObjectField(
388 env->GetObjectField(joverlayControlData, IDCache::GetOverlayControlDataIdField())); 391 joverlayControlData, Common::Android::GetOverlayControlDataIdField()));
389 bool enabled = static_cast<bool>(env->GetBooleanField( 392 bool enabled = static_cast<bool>(env->GetBooleanField(
390 joverlayControlData, IDCache::GetOverlayControlDataEnabledField())); 393 joverlayControlData, Common::Android::GetOverlayControlDataEnabledField()));
391 394
392 jobject jlandscapePosition = env->GetObjectField( 395 jobject jlandscapePosition = env->GetObjectField(
393 joverlayControlData, IDCache::GetOverlayControlDataLandscapePositionField()); 396 joverlayControlData, Common::Android::GetOverlayControlDataLandscapePositionField());
394 std::pair<double, double> landscape_position = std::make_pair( 397 std::pair<double, double> landscape_position = std::make_pair(
395 GetJDouble(env, env->GetObjectField(jlandscapePosition, IDCache::GetPairFirstField())), 398 Common::Android::GetJDouble(
396 GetJDouble(env, 399 env, env->GetObjectField(jlandscapePosition, Common::Android::GetPairFirstField())),
397 env->GetObjectField(jlandscapePosition, IDCache::GetPairSecondField()))); 400 Common::Android::GetJDouble(
401 env,
402 env->GetObjectField(jlandscapePosition, Common::Android::GetPairSecondField())));
398 403
399 jobject jportraitPosition = env->GetObjectField( 404 jobject jportraitPosition = env->GetObjectField(
400 joverlayControlData, IDCache::GetOverlayControlDataPortraitPositionField()); 405 joverlayControlData, Common::Android::GetOverlayControlDataPortraitPositionField());
401 std::pair<double, double> portrait_position = std::make_pair( 406 std::pair<double, double> portrait_position = std::make_pair(
402 GetJDouble(env, env->GetObjectField(jportraitPosition, IDCache::GetPairFirstField())), 407 Common::Android::GetJDouble(
403 GetJDouble(env, env->GetObjectField(jportraitPosition, IDCache::GetPairSecondField()))); 408 env, env->GetObjectField(jportraitPosition, Common::Android::GetPairFirstField())),
409 Common::Android::GetJDouble(
410 env,
411 env->GetObjectField(jportraitPosition, Common::Android::GetPairSecondField())));
404 412
405 jobject jfoldablePosition = env->GetObjectField( 413 jobject jfoldablePosition = env->GetObjectField(
406 joverlayControlData, IDCache::GetOverlayControlDataFoldablePositionField()); 414 joverlayControlData, Common::Android::GetOverlayControlDataFoldablePositionField());
407 std::pair<double, double> foldable_position = std::make_pair( 415 std::pair<double, double> foldable_position = std::make_pair(
408 GetJDouble(env, env->GetObjectField(jfoldablePosition, IDCache::GetPairFirstField())), 416 Common::Android::GetJDouble(
409 GetJDouble(env, env->GetObjectField(jfoldablePosition, IDCache::GetPairSecondField()))); 417 env, env->GetObjectField(jfoldablePosition, Common::Android::GetPairFirstField())),
418 Common::Android::GetJDouble(
419 env,
420 env->GetObjectField(jfoldablePosition, Common::Android::GetPairSecondField())));
410 421
411 AndroidSettings::values.overlay_control_data.push_back(AndroidSettings::OverlayControlData{ 422 AndroidSettings::values.overlay_control_data.push_back(AndroidSettings::OverlayControlData{
412 GetJString(env, jidString), enabled, landscape_position, portrait_position, 423 Common::Android::GetJString(env, jidString), enabled, landscape_position,
413 foldable_position}); 424 portrait_position, foldable_position});
414 } 425 }
415} 426}
416 427
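The native_config.cpp hunk above is a mechanical namespace migration: the free helpers (GetJString, GetJDouble) and the former IDCache:: accessors move into Common::Android, and the longer qualified names force the argument reflow. The JNI marshalling pattern itself is unchanged; each of the three position reads (landscape, portrait, foldable) is an instance of the shape below. A minimal sketch, assuming the cached accessors behave as shown in the diff (the helper name UnpackPairField is illustrative, not part of the commit):

    // Unpack a Kotlin Pair<Double, Double> object field into std::pair<double, double>,
    // reading each boxed java.lang.Double through the cached value field via GetJDouble.
    static std::pair<double, double> UnpackPairField(JNIEnv* env, jobject owner, jfieldID field) {
        const jobject jpair = env->GetObjectField(owner, field);
        return std::make_pair(
            Common::Android::GetJDouble(
                env, env->GetObjectField(jpair, Common::Android::GetPairFirstField())),
            Common::Android::GetJDouble(
                env, env->GetObjectField(jpair, Common::Android::GetPairSecondField())));
    }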
diff --git a/src/android/app/src/main/jni/native_log.cpp b/src/android/app/src/main/jni/native_log.cpp
index 33d691dc8..95dd1f057 100755
--- a/src/android/app/src/main/jni/native_log.cpp
+++ b/src/android/app/src/main/jni/native_log.cpp
@@ -1,31 +1,30 @@
1// SPDX-FileCopyrightText: 2023 yuzu Emulator Project 1// SPDX-FileCopyrightText: 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later 2// SPDX-License-Identifier: GPL-2.0-or-later
3 3
4#include <common/android/android_common.h>
4#include <common/logging/log.h> 5#include <common/logging/log.h>
5#include <jni.h> 6#include <jni.h>
6 7
7#include "android_common/android_common.h"
8
9extern "C" { 8extern "C" {
10 9
11void Java_org_yuzu_yuzu_1emu_utils_Log_debug(JNIEnv* env, jobject obj, jstring jmessage) { 10void Java_org_yuzu_yuzu_1emu_utils_Log_debug(JNIEnv* env, jobject obj, jstring jmessage) {
12 LOG_DEBUG(Frontend, "{}", GetJString(env, jmessage)); 11 LOG_DEBUG(Frontend, "{}", Common::Android::GetJString(env, jmessage));
13} 12}
14 13
15void Java_org_yuzu_yuzu_1emu_utils_Log_warning(JNIEnv* env, jobject obj, jstring jmessage) { 14void Java_org_yuzu_yuzu_1emu_utils_Log_warning(JNIEnv* env, jobject obj, jstring jmessage) {
16 LOG_WARNING(Frontend, "{}", GetJString(env, jmessage)); 15 LOG_WARNING(Frontend, "{}", Common::Android::GetJString(env, jmessage));
17} 16}
18 17
19void Java_org_yuzu_yuzu_1emu_utils_Log_info(JNIEnv* env, jobject obj, jstring jmessage) { 18void Java_org_yuzu_yuzu_1emu_utils_Log_info(JNIEnv* env, jobject obj, jstring jmessage) {
20 LOG_INFO(Frontend, "{}", GetJString(env, jmessage)); 19 LOG_INFO(Frontend, "{}", Common::Android::GetJString(env, jmessage));
21} 20}
22 21
23void Java_org_yuzu_yuzu_1emu_utils_Log_error(JNIEnv* env, jobject obj, jstring jmessage) { 22void Java_org_yuzu_yuzu_1emu_utils_Log_error(JNIEnv* env, jobject obj, jstring jmessage) {
24 LOG_ERROR(Frontend, "{}", GetJString(env, jmessage)); 23 LOG_ERROR(Frontend, "{}", Common::Android::GetJString(env, jmessage));
25} 24}
26 25
27void Java_org_yuzu_yuzu_1emu_utils_Log_critical(JNIEnv* env, jobject obj, jstring jmessage) { 26void Java_org_yuzu_yuzu_1emu_utils_Log_critical(JNIEnv* env, jobject obj, jstring jmessage) {
28 LOG_CRITICAL(Frontend, "{}", GetJString(env, jmessage)); 27 LOG_CRITICAL(Frontend, "{}", Common::Android::GetJString(env, jmessage));
29} 28}
30 29
31} // extern "C" 30} // extern "C"
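The log bridge keeps its one-binding-per-level shape; only the GetJString qualification changes. For readers unfamiliar with JNI symbol naming: '.' in a Java package maps to '_' and a literal '_' is escaped as '_1', which is why org.yuzu.yuzu_emu.utils.Log#debug exports as Java_org_yuzu_yuzu_1emu_utils_Log_debug. A hypothetical extra binding in the same pattern (not part of this commit; it assumes the LOG_TRACE macro from common/logging/log.h):

    // Hypothetical: forward a trace level exactly the way the five existing bindings do.
    extern "C" void Java_org_yuzu_yuzu_1emu_utils_Log_trace(JNIEnv* env, jobject obj, jstring jmessage) {
        LOG_TRACE(Frontend, "{}", Common::Android::GetJString(env, jmessage));
    }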
diff --git a/src/android/app/src/main/res/layout/fragment_emulation.xml b/src/android/app/src/main/res/layout/fragment_emulation.xml
index 0d2bfe8d6..e99a15783 100755
--- a/src/android/app/src/main/res/layout/fragment_emulation.xml
+++ b/src/android/app/src/main/res/layout/fragment_emulation.xml
@@ -140,6 +140,7 @@
140 android:id="@+id/overlay_container" 140 android:id="@+id/overlay_container"
141 android:layout_width="match_parent" 141 android:layout_width="match_parent"
142 android:layout_height="match_parent" 142 android:layout_height="match_parent"
143 android:layout_marginHorizontal="20dp"
143 android:fitsSystemWindows="true"> 144 android:fitsSystemWindows="true">
144 145
145 <com.google.android.material.textview.MaterialTextView 146 <com.google.android.material.textview.MaterialTextView
@@ -150,7 +151,19 @@
150 android:layout_gravity="left" 151 android:layout_gravity="left"
151 android:clickable="false" 152 android:clickable="false"
152 android:focusable="false" 153 android:focusable="false"
153 android:paddingHorizontal="20dp" 154 android:textColor="@android:color/white"
155 android:shadowColor="@android:color/black"
156 android:shadowRadius="3"
157 tools:ignore="RtlHardcoded" />
158
159 <com.google.android.material.textview.MaterialTextView
160 android:id="@+id/show_thermals_text"
161 style="@style/TextAppearance.Material3.BodySmall"
162 android:layout_width="wrap_content"
163 android:layout_height="wrap_content"
164 android:layout_gravity="right"
165 android:clickable="false"
166 android:focusable="false"
154 android:textColor="@android:color/white" 167 android:textColor="@android:color/white"
155 android:shadowColor="@android:color/black" 168 android:shadowColor="@android:color/black"
156 android:shadowRadius="3" 169 android:shadowRadius="3"
diff --git a/src/android/app/src/main/res/menu/menu_overlay_options.xml b/src/android/app/src/main/res/menu/menu_overlay_options.xml
index 363781652..a9e807427 100755
--- a/src/android/app/src/main/res/menu/menu_overlay_options.xml
+++ b/src/android/app/src/main/res/menu/menu_overlay_options.xml
@@ -7,6 +7,11 @@
7 android:checkable="true" /> 7 android:checkable="true" />
8 8
9 <item 9 <item
10 android:id="@+id/thermal_indicator"
11 android:title="@string/emulation_thermal_indicator"
12 android:checkable="true" />
13
14 <item
10 android:id="@+id/menu_edit_overlay" 15 android:id="@+id/menu_edit_overlay"
11 android:title="@string/emulation_touch_overlay_edit" /> 16 android:title="@string/emulation_touch_overlay_edit" />
12 17
diff --git a/src/android/app/src/main/res/values-ar/strings.xml b/src/android/app/src/main/res/values-ar/strings.xml
index 53678f465..41d741847 100755
--- a/src/android/app/src/main/res/values-ar/strings.xml
+++ b/src/android/app/src/main/res/values-ar/strings.xml
@@ -1,9 +1,6 @@
1<?xml version="1.0" encoding="utf-8"?> 1<?xml version="1.0" encoding="utf-8"?>
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="emulation_notification_channel_name">المحاكي نشط</string>
5 <string name="emulation_notification_channel_description">اظهار اشعار دائم عندما يكون المحاكي نشطاً</string>
6 <string name="emulation_notification_running">يوزو قيد التشغيل</string>
7 <string name="notice_notification_channel_name">الإشعارات والأخطاء</string> 4 <string name="notice_notification_channel_name">الإشعارات والأخطاء</string>
8 <string name="notice_notification_channel_description">اظهار اشعار عند حصول اي مشكلة.</string> 5 <string name="notice_notification_channel_description">اظهار اشعار عند حصول اي مشكلة.</string>
9 <string name="notification_permission_not_granted">لم يتم منح إذن الإشعار</string> 6 <string name="notification_permission_not_granted">لم يتم منح إذن الإشعار</string>
diff --git a/src/android/app/src/main/res/values-ckb/strings.xml b/src/android/app/src/main/res/values-ckb/strings.xml
index 7e1eb2b8d..827339505 100755
--- a/src/android/app/src/main/res/values-ckb/strings.xml
+++ b/src/android/app/src/main/res/values-ckb/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">ئەم نەرمەکاڵایە یارییەکانی کۆنسۆلی نینتێندۆ سویچ کارپێدەکات. هیچ ناونیشانێکی یاری و کلیلی تێدا نییە..&lt;br /&gt;&lt;br /&gt;پێش ئەوەی دەست پێ بکەیت، تکایە شوێنی فایلی <![CDATA[<b> prod.keys </b>]]> دیاریبکە لە نێو کۆگای ئامێرەکەت.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">زیاتر فێربە</a>]]></string> 4 <string name="app_disclaimer">ئەم نەرمەکاڵایە یارییەکانی کۆنسۆلی نینتێندۆ سویچ کارپێدەکات. هیچ ناونیشانێکی یاری و کلیلی تێدا نییە..&lt;br /&gt;&lt;br /&gt;پێش ئەوەی دەست پێ بکەیت، تکایە شوێنی فایلی <![CDATA[<b> prod.keys </b>]]> دیاریبکە لە نێو کۆگای ئامێرەکەت.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">زیاتر فێربە</a>]]></string>
5 <string name="emulation_notification_channel_name">ئیمولەیشن کارایە</string>
6 <string name="emulation_notification_channel_description">ئاگادارکردنەوەیەکی بەردەوام نیشان دەدات کاتێک ئیمولەیشن کاردەکات.</string>
7 <string name="emulation_notification_running">یوزو کاردەکات</string>
8 <string name="notice_notification_channel_name">ئاگاداری و هەڵەکان</string> 5 <string name="notice_notification_channel_name">ئاگاداری و هەڵەکان</string>
9 <string name="notice_notification_channel_description">ئاگادارکردنەوەکان پیشان دەدات کاتێک شتێک بە هەڵەدا دەچێت.</string> 6 <string name="notice_notification_channel_description">ئاگادارکردنەوەکان پیشان دەدات کاتێک شتێک بە هەڵەدا دەچێت.</string>
10 <string name="notification_permission_not_granted">مۆڵەتی ئاگادارکردنەوە نەدراوە!</string> 7 <string name="notification_permission_not_granted">مۆڵەتی ئاگادارکردنەوە نەدراوە!</string>
diff --git a/src/android/app/src/main/res/values-cs/strings.xml b/src/android/app/src/main/res/values-cs/strings.xml
index b9a4a11e4..8f8e2848d 100755
--- a/src/android/app/src/main/res/values-cs/strings.xml
+++ b/src/android/app/src/main/res/values-cs/strings.xml
@@ -1,7 +1,6 @@
1<?xml version="1.0" encoding="utf-8"?> 1<?xml version="1.0" encoding="utf-8"?>
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="emulation_notification_channel_name">Emulace je aktivní</string>
5 <string name="notice_notification_channel_name">Upozornění a chyby</string> 4 <string name="notice_notification_channel_name">Upozornění a chyby</string>
6 <string name="notice_notification_channel_description">Ukáže oznámení v případě chyby.</string> 5 <string name="notice_notification_channel_description">Ukáže oznámení v případě chyby.</string>
7 <string name="notification_permission_not_granted">Oznámení nejsou oprávněna!</string> 6 <string name="notification_permission_not_granted">Oznámení nejsou oprávněna!</string>
diff --git a/src/android/app/src/main/res/values-de/strings.xml b/src/android/app/src/main/res/values-de/strings.xml
index 483ea8c88..fb25b3c93 100755
--- a/src/android/app/src/main/res/values-de/strings.xml
+++ b/src/android/app/src/main/res/values-de/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Diese Software kann Spiele für die Nintendo Switch abspielen. Keine Spiele oder Spielekeys sind enthalten.&lt;br /&gt;&lt;br /&gt;Bevor du beginnst, bitte halte deine <![CDATA[<b> prod.keys </b>]]> auf deinem Gerät bereit. .&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Mehr Infos</a>]]></string> 4 <string name="app_disclaimer">Diese Software kann Spiele für die Nintendo Switch abspielen. Keine Spiele oder Spielekeys sind enthalten.&lt;br /&gt;&lt;br /&gt;Bevor du beginnst, bitte halte deine <![CDATA[<b> prod.keys </b>]]> auf deinem Gerät bereit. .&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Mehr Infos</a>]]></string>
5 <string name="emulation_notification_channel_name">Emulation ist aktiv</string>
6 <string name="emulation_notification_channel_description">Zeigt eine dauerhafte Benachrichtigung an, wenn die Emulation läuft.</string>
7 <string name="emulation_notification_running">yuzu läuft</string>
8 <string name="notice_notification_channel_name">Hinweise und Fehler</string> 5 <string name="notice_notification_channel_name">Hinweise und Fehler</string>
9 <string name="notice_notification_channel_description">Zeigt Benachrichtigungen an, wenn etwas schief läuft.</string> 6 <string name="notice_notification_channel_description">Zeigt Benachrichtigungen an, wenn etwas schief läuft.</string>
10 <string name="notification_permission_not_granted">Berechtigung für Benachrichtigungen nicht erlaubt!</string> 7 <string name="notification_permission_not_granted">Berechtigung für Benachrichtigungen nicht erlaubt!</string>
diff --git a/src/android/app/src/main/res/values-es/strings.xml b/src/android/app/src/main/res/values-es/strings.xml
index c3825710b..7ecbeaba4 100755
--- a/src/android/app/src/main/res/values-es/strings.xml
+++ b/src/android/app/src/main/res/values-es/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Este software ejecuta juegos para la videoconsola Nintendo Switch. Los videojuegos o claves no vienen incluidos.&lt;br /&gt;&lt;br /&gt;Antes de empezar, por favor, localice el archivo <![CDATA[<b> prod.keys </b>]]>en el almacenamiento de su dispositivo..&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Saber más</a>]]></string> 4 <string name="app_disclaimer">Este software ejecuta juegos para la videoconsola Nintendo Switch. Los videojuegos o claves no vienen incluidos.&lt;br /&gt;&lt;br /&gt;Antes de empezar, por favor, localice el archivo <![CDATA[<b> prod.keys </b>]]>en el almacenamiento de su dispositivo..&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Saber más</a>]]></string>
5 <string name="emulation_notification_channel_name">Emulación activa</string>
6 <string name="emulation_notification_channel_description">Muestra una notificación persistente cuando la emulación está activa.</string>
7 <string name="emulation_notification_running">yuzu está ejecutándose</string>
8 <string name="notice_notification_channel_name">Avisos y errores</string> 5 <string name="notice_notification_channel_name">Avisos y errores</string>
9 <string name="notice_notification_channel_description">Mostrar notificaciones cuándo algo vaya mal.</string> 6 <string name="notice_notification_channel_description">Mostrar notificaciones cuándo algo vaya mal.</string>
10 <string name="notification_permission_not_granted">¡Permisos de notificación no concedidos!</string> 7 <string name="notification_permission_not_granted">¡Permisos de notificación no concedidos!</string>
diff --git a/src/android/app/src/main/res/values-fr/strings.xml b/src/android/app/src/main/res/values-fr/strings.xml
index 667fe33cb..a848b9163 100755
--- a/src/android/app/src/main/res/values-fr/strings.xml
+++ b/src/android/app/src/main/res/values-fr/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Ce logiciel exécutera des jeux pour la console de jeu Nintendo Switch. Aucun jeux ou clés n\'est inclus.&lt;br /&gt;&lt;br /&gt;Avant de commencer, veuillez localiser votre fichier <![CDATA[<b> prod.keys </b>]]> sur le stockage de votre appareil.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">En savoir plus</a>]]></string> 4 <string name="app_disclaimer">Ce logiciel exécutera des jeux pour la console de jeu Nintendo Switch. Aucun jeux ou clés n\'est inclus.&lt;br /&gt;&lt;br /&gt;Avant de commencer, veuillez localiser votre fichier <![CDATA[<b> prod.keys </b>]]> sur le stockage de votre appareil.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">En savoir plus</a>]]></string>
5 <string name="emulation_notification_channel_name">L\'émulation est active</string>
6 <string name="emulation_notification_channel_description">Affiche une notification persistante lorsque l\'émulation est en cours d\'exécution.</string>
7 <string name="emulation_notification_running">yuzu est en cours d\'exécution</string>
8 <string name="notice_notification_channel_name">Avis et erreurs</string> 5 <string name="notice_notification_channel_name">Avis et erreurs</string>
9 <string name="notice_notification_channel_description">Affiche des notifications en cas de problème.</string> 6 <string name="notice_notification_channel_description">Affiche des notifications en cas de problème.</string>
10 <string name="notification_permission_not_granted">Permission de notification non accordée !</string> 7 <string name="notification_permission_not_granted">Permission de notification non accordée !</string>
diff --git a/src/android/app/src/main/res/values-he/strings.xml b/src/android/app/src/main/res/values-he/strings.xml
index 41e4450c6..6096605a9 100755
--- a/src/android/app/src/main/res/values-he/strings.xml
+++ b/src/android/app/src/main/res/values-he/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">התוכנה תריץ משחקים לקונסולת ה Nintendo Switch. אף משחק או קבצים בעלי זכויות יוצרים נכללים.&lt;br /&gt;&lt;br /&gt; לפני שאת/ה מתחיל בבקשה מצא את קובץ <![CDATA[<b>prod.keys</b>]]> על המכשיר.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">קרא עוד</a>]]></string> 4 <string name="app_disclaimer">התוכנה תריץ משחקים לקונסולת ה Nintendo Switch. אף משחק או קבצים בעלי זכויות יוצרים נכללים.&lt;br /&gt;&lt;br /&gt; לפני שאת/ה מתחיל בבקשה מצא את קובץ <![CDATA[<b>prod.keys</b>]]> על המכשיר.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">קרא עוד</a>]]></string>
5 <string name="emulation_notification_channel_name">אמולציה פעילה</string>
6 <string name="emulation_notification_channel_description">מציג התראה מתמשכת כאשר האמולציה פועלת.</string>
7 <string name="emulation_notification_running">yuzu רץ</string>
8 <string name="notice_notification_channel_name">התראות ותקלות</string> 5 <string name="notice_notification_channel_name">התראות ותקלות</string>
9 <string name="notice_notification_channel_description">מציג התראות כאשר משהו הולך לא כשורה.</string> 6 <string name="notice_notification_channel_description">מציג התראות כאשר משהו הולך לא כשורה.</string>
10 <string name="notification_permission_not_granted">הרשאות התראות לא ניתנה!</string> 7 <string name="notification_permission_not_granted">הרשאות התראות לא ניתנה!</string>
diff --git a/src/android/app/src/main/res/values-hu/strings.xml b/src/android/app/src/main/res/values-hu/strings.xml
index 554da0816..f3a29e0c3 100755
--- a/src/android/app/src/main/res/values-hu/strings.xml
+++ b/src/android/app/src/main/res/values-hu/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Ez a szoftver Nintendo Switch játékkonzolhoz készült játékokat futtat. Nem tartalmaz játékokat vagy kulcsokat. .&lt;br /&gt;&lt;br /&gt;Mielőtt hozzákezdenél, kérjük, válaszd ki a <![CDATA[<b>prod.keys</b>]]> fájl helyét a készülék tárhelyén&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Tudj meg többet</a>]]></string> 4 <string name="app_disclaimer">Ez a szoftver Nintendo Switch játékkonzolhoz készült játékokat futtat. Nem tartalmaz játékokat vagy kulcsokat. .&lt;br /&gt;&lt;br /&gt;Mielőtt hozzákezdenél, kérjük, válaszd ki a <![CDATA[<b>prod.keys</b>]]> fájl helyét a készülék tárhelyén&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Tudj meg többet</a>]]></string>
5 <string name="emulation_notification_channel_name">Emuláció aktív</string>
6 <string name="emulation_notification_channel_description">Állandó értesítést jelenít meg, amíg az emuláció fut.</string>
7 <string name="emulation_notification_running">A yuzu fut</string>
8 <string name="notice_notification_channel_name">Megjegyzések és hibák</string> 5 <string name="notice_notification_channel_name">Megjegyzések és hibák</string>
9 <string name="notice_notification_channel_description">Értesítések megjelenítése, ha valami rosszul sül el.</string> 6 <string name="notice_notification_channel_description">Értesítések megjelenítése, ha valami rosszul sül el.</string>
10 <string name="notification_permission_not_granted">Nincs engedély az értesítés megjelenítéséhez!</string> 7 <string name="notification_permission_not_granted">Nincs engedély az értesítés megjelenítéséhez!</string>
diff --git a/src/android/app/src/main/res/values-it/strings.xml b/src/android/app/src/main/res/values-it/strings.xml
index 61b39f57f..433d84f5c 100755
--- a/src/android/app/src/main/res/values-it/strings.xml
+++ b/src/android/app/src/main/res/values-it/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Questo software permette di giocare ai giochi della console Nintendo Switch. Nessun gioco o chiave è inclusa.&lt;br /&gt;&lt;br /&gt;Prima di iniziare, perfavore individua il file <![CDATA[<b>prod.keys </b>]]> nella memoria del tuo dispositivo.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Scopri di più</a>]]></string> 4 <string name="app_disclaimer">Questo software permette di giocare ai giochi della console Nintendo Switch. Nessun gioco o chiave è inclusa.&lt;br /&gt;&lt;br /&gt;Prima di iniziare, perfavore individua il file <![CDATA[<b>prod.keys </b>]]> nella memoria del tuo dispositivo.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Scopri di più</a>]]></string>
5 <string name="emulation_notification_channel_name">L\'emulatore è attivo</string>
6 <string name="emulation_notification_channel_description">Mostra una notifica persistente quando l\'emulatore è in esecuzione.</string>
7 <string name="emulation_notification_running">yuzu è in esecuzione</string>
8 <string name="notice_notification_channel_name">Avvisi ed errori</string> 5 <string name="notice_notification_channel_name">Avvisi ed errori</string>
9 <string name="notice_notification_channel_description">Mostra le notifiche quando qualcosa va storto.</string> 6 <string name="notice_notification_channel_description">Mostra le notifiche quando qualcosa va storto.</string>
10 <string name="notification_permission_not_granted">Autorizzazione di notifica non concessa!</string> 7 <string name="notification_permission_not_granted">Autorizzazione di notifica non concessa!</string>
diff --git a/src/android/app/src/main/res/values-ja/strings.xml b/src/android/app/src/main/res/values-ja/strings.xml
index 0cff40bb6..da73ad651 100755
--- a/src/android/app/src/main/res/values-ja/strings.xml
+++ b/src/android/app/src/main/res/values-ja/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">このソフトウェアでは、Nintendo Switchのゲームを実行できます。 ゲームソフトやキーは含まれません。&lt;br /&gt;&lt;br /&gt;事前に、 <![CDATA[<b> prod.keys </b>]]> ファイルをストレージに配置しておいてください。&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">詳細</a>]]></string> 4 <string name="app_disclaimer">このソフトウェアでは、Nintendo Switchのゲームを実行できます。 ゲームソフトやキーは含まれません。&lt;br /&gt;&lt;br /&gt;事前に、 <![CDATA[<b> prod.keys </b>]]> ファイルをストレージに配置しておいてください。&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">詳細</a>]]></string>
5 <string name="emulation_notification_channel_name">エミュレーションが有効です</string>
6 <string name="emulation_notification_channel_description">エミュレーションの実行中に常設通知を表示します。</string>
7 <string name="emulation_notification_running">yuzu は実行中です</string>
8 <string name="notice_notification_channel_name">通知とエラー</string> 5 <string name="notice_notification_channel_name">通知とエラー</string>
9 <string name="notice_notification_channel_description">問題の発生時に通知を表示します。</string> 6 <string name="notice_notification_channel_description">問題の発生時に通知を表示します。</string>
10 <string name="notification_permission_not_granted">通知が許可されていません!</string> 7 <string name="notification_permission_not_granted">通知が許可されていません!</string>
diff --git a/src/android/app/src/main/res/values-ko/strings.xml b/src/android/app/src/main/res/values-ko/strings.xml
index eaa6c23ce..904353d34 100755
--- a/src/android/app/src/main/res/values-ko/strings.xml
+++ b/src/android/app/src/main/res/values-ko/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">이 소프트웨어는 Nintendo Switch 게임을 실행합니다. 게임 타이틀이나 키는 포함되어 있지 않습니다.&lt;br /&gt;&lt;br /&gt;시작하기 전에 장치 저장소에서 <![CDATA[<b> prod.keys </b>]]> 파일을 찾아주세요.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">자세히 알아보기</a>]]></string> 4 <string name="app_disclaimer">이 소프트웨어는 Nintendo Switch 게임을 실행합니다. 게임 타이틀이나 키는 포함되어 있지 않습니다.&lt;br /&gt;&lt;br /&gt;시작하기 전에 장치 저장소에서 <![CDATA[<b> prod.keys </b>]]> 파일을 찾아주세요.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">자세히 알아보기</a>]]></string>
5 <string name="emulation_notification_channel_name">에뮬레이션이 활성화됨</string>
6 <string name="emulation_notification_channel_description">에뮬레이션이 실행 중일 때 지속적으로 알림을 표시합니다.</string>
7 <string name="emulation_notification_running">yuzu가 실행 중입니다.</string>
8 <string name="notice_notification_channel_name">알림 및 오류</string> 5 <string name="notice_notification_channel_name">알림 및 오류</string>
9 <string name="notice_notification_channel_description">문제가 발생하면 알림을 표시합니다.</string> 6 <string name="notice_notification_channel_description">문제가 발생하면 알림을 표시합니다.</string>
10 <string name="notification_permission_not_granted">알림 권한이 부여되지 않았습니다!</string> 7 <string name="notification_permission_not_granted">알림 권한이 부여되지 않았습니다!</string>
diff --git a/src/android/app/src/main/res/values-nb/strings.xml b/src/android/app/src/main/res/values-nb/strings.xml
index e92dc62d9..fe3af5920 100755
--- a/src/android/app/src/main/res/values-nb/strings.xml
+++ b/src/android/app/src/main/res/values-nb/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Denne programvaren vil kjøre spill for Nintendo Switch-spillkonsollen. Ingen spilltitler eller nøkler er inkludert.&lt;br /&gt;&lt;br /&gt;Før du begynner, må du finne <![CDATA[<b> prod.keys </b>]]> filen din på enhetslagringen.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Lær mer</a>]]></string> 4 <string name="app_disclaimer">Denne programvaren vil kjøre spill for Nintendo Switch-spillkonsollen. Ingen spilltitler eller nøkler er inkludert.&lt;br /&gt;&lt;br /&gt;Før du begynner, må du finne <![CDATA[<b> prod.keys </b>]]> filen din på enhetslagringen.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Lær mer</a>]]></string>
5 <string name="emulation_notification_channel_name">Emulering er aktiv</string>
6 <string name="emulation_notification_channel_description">Viser et vedvarende varsel når emuleringen kjører.</string>
7 <string name="emulation_notification_running">Yuzu kjører</string>
8 <string name="notice_notification_channel_name">Merknader og feil</string> 5 <string name="notice_notification_channel_name">Merknader og feil</string>
9 <string name="notice_notification_channel_description">Viser varsler når noe går galt.</string> 6 <string name="notice_notification_channel_description">Viser varsler når noe går galt.</string>
10 <string name="notification_permission_not_granted">Varslingstillatelse ikke gitt!</string> 7 <string name="notification_permission_not_granted">Varslingstillatelse ikke gitt!</string>
diff --git a/src/android/app/src/main/res/values-pl/strings.xml b/src/android/app/src/main/res/values-pl/strings.xml
index fbd0ad7e9..2af7fd7b4 100755
--- a/src/android/app/src/main/res/values-pl/strings.xml
+++ b/src/android/app/src/main/res/values-pl/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">To oprogramowanie umożliwia uruchomienie gier z konsoli Nintendo Switch. Nie zawiera gier ani wymaganych kluczy.&lt;br /&gt;&lt;br /&gt;Zanim zaczniesz, wybierz plik kluczy <![CDATA[<b> prod.keys </b>]]> z katalogu w pamięci masowej.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Dowiedz się więcej</a>]]></string> 4 <string name="app_disclaimer">To oprogramowanie umożliwia uruchomienie gier z konsoli Nintendo Switch. Nie zawiera gier ani wymaganych kluczy.&lt;br /&gt;&lt;br /&gt;Zanim zaczniesz, wybierz plik kluczy <![CDATA[<b> prod.keys </b>]]> z katalogu w pamięci masowej.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Dowiedz się więcej</a>]]></string>
5 <string name="emulation_notification_channel_name">Emulacja jest uruchomiona</string>
6 <string name="emulation_notification_channel_description">Pokaż trwałe powiadomienie gdy emulacja jest uruchomiona.</string>
7 <string name="emulation_notification_running">yuzu jest uruchomiony</string>
8 <string name="notice_notification_channel_name">Powiadomienia błędy</string> 5 <string name="notice_notification_channel_name">Powiadomienia błędy</string>
9 <string name="notice_notification_channel_description">Pokaż powiadomienie gdy coś pójdzie źle</string> 6 <string name="notice_notification_channel_description">Pokaż powiadomienie gdy coś pójdzie źle</string>
10 <string name="notification_permission_not_granted">Nie zezwolono na powiadomienia!</string> 7 <string name="notification_permission_not_granted">Nie zezwolono na powiadomienia!</string>
diff --git a/src/android/app/src/main/res/values-pt-rBR/strings.xml b/src/android/app/src/main/res/values-pt-rBR/strings.xml
index a87eb11e4..130252590 100755
--- a/src/android/app/src/main/res/values-pt-rBR/strings.xml
+++ b/src/android/app/src/main/res/values-pt-rBR/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Este software executa jogos do console Nintendo Switch. Não estão inclusos nem jogos ou chaves.&lt;br /&gt;&lt;br /&gt;Antes de começar, por favor localize o arquivo <![CDATA[<b> prod.keys </b>]]> no armazenamento de seu dispositivo.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Saiba mais</a>]]></string> 4 <string name="app_disclaimer">Este software executa jogos do console Nintendo Switch. Não estão inclusos nem jogos ou chaves.&lt;br /&gt;&lt;br /&gt;Antes de começar, por favor localize o arquivo <![CDATA[<b> prod.keys </b>]]> no armazenamento de seu dispositivo.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Saiba mais</a>]]></string>
5 <string name="emulation_notification_channel_name">A emulação está Ativa</string>
6 <string name="emulation_notification_channel_description">Mostra uma notificação permanente enquanto a emulação estiver em andamento.</string>
7 <string name="emulation_notification_running">O Yuzu está em execução </string>
8 <string name="notice_notification_channel_name">Notificações e erros</string> 5 <string name="notice_notification_channel_name">Notificações e erros</string>
9 <string name="notice_notification_channel_description">Mostra notificações quando algo dá errado.</string> 6 <string name="notice_notification_channel_description">Mostra notificações quando algo dá errado.</string>
10 <string name="notification_permission_not_granted">Acesso às notificações não concedido!</string> 7 <string name="notification_permission_not_granted">Acesso às notificações não concedido!</string>
diff --git a/src/android/app/src/main/res/values-pt-rPT/strings.xml b/src/android/app/src/main/res/values-pt-rPT/strings.xml
index 684a71616..0fdbae4f8 100755
--- a/src/android/app/src/main/res/values-pt-rPT/strings.xml
+++ b/src/android/app/src/main/res/values-pt-rPT/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Este software corre jogos para a consola Nintendo Switch. Não estão incluídas nem jogos ou chaves. &lt;br /&gt;&lt;br /&gt;Antes de começares, por favor localiza o ficheiro <![CDATA[1 prod.keys 1]]> no armazenamento do teu dispositivo.&lt;br /&gt;&lt;br /&gt;<![CDATA[2Learn more2]]></string> 4 <string name="app_disclaimer">Este software corre jogos para a consola Nintendo Switch. Não estão incluídas nem jogos ou chaves. &lt;br /&gt;&lt;br /&gt;Antes de começares, por favor localiza o ficheiro <![CDATA[1 prod.keys 1]]> no armazenamento do teu dispositivo.&lt;br /&gt;&lt;br /&gt;<![CDATA[2Learn more2]]></string>
5 <string name="emulation_notification_channel_name">Emulação está Ativa</string>
6 <string name="emulation_notification_channel_description">Mostra uma notificação permanente enquanto a emulação está a correr.</string>
7 <string name="emulation_notification_running">Yuzu está em execução </string>
8 <string name="notice_notification_channel_name">Notificações e erros</string> 5 <string name="notice_notification_channel_name">Notificações e erros</string>
9 <string name="notice_notification_channel_description">Mostra notificações quendo algo corre mal.</string> 6 <string name="notice_notification_channel_description">Mostra notificações quendo algo corre mal.</string>
10 <string name="notification_permission_not_granted">Permissões de notificação não permitidas </string> 7 <string name="notification_permission_not_granted">Permissões de notificação não permitidas </string>
diff --git a/src/android/app/src/main/res/values-ru/strings.xml b/src/android/app/src/main/res/values-ru/strings.xml
index 099b2c9eb..2dfd4a824 100755
--- a/src/android/app/src/main/res/values-ru/strings.xml
+++ b/src/android/app/src/main/res/values-ru/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Это программное обеспечение позволяет запускать игры для игровой консоли Nintendo Switch. Мы не предоставляем сами игры или ключи.&lt;br /&gt;&lt;br /&gt;Перед началом работы найдите файл <![CDATA[<b> prod.keys </b>]]> в хранилище устройства..&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Узнать больше</a>]]></string> 4 <string name="app_disclaimer">Это программное обеспечение позволяет запускать игры для игровой консоли Nintendo Switch. Мы не предоставляем сами игры или ключи.&lt;br /&gt;&lt;br /&gt;Перед началом работы найдите файл <![CDATA[<b> prod.keys </b>]]> в хранилище устройства..&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Узнать больше</a>]]></string>
5 <string name="emulation_notification_channel_name">Эмуляция активна</string>
6 <string name="emulation_notification_channel_description">Показывает постоянное уведомление, когда запущена эмуляция.</string>
7 <string name="emulation_notification_running">yuzu запущен</string>
8 <string name="notice_notification_channel_name">Уведомления и ошибки</string> 5 <string name="notice_notification_channel_name">Уведомления и ошибки</string>
9 <string name="notice_notification_channel_description">Показывать уведомления, когда что-то пошло не так</string> 6 <string name="notice_notification_channel_description">Показывать уведомления, когда что-то пошло не так</string>
10 <string name="notification_permission_not_granted">Вы не предоставили разрешение на уведомления!</string> 7 <string name="notification_permission_not_granted">Вы не предоставили разрешение на уведомления!</string>
diff --git a/src/android/app/src/main/res/values-uk/strings.xml b/src/android/app/src/main/res/values-uk/strings.xml
index 361f0b726..9a2804a93 100755
--- a/src/android/app/src/main/res/values-uk/strings.xml
+++ b/src/android/app/src/main/res/values-uk/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Це програмне забезпечення дозволяє запускати ігри для ігрової консолі Nintendo Switch. Ми не надаємо самі ігри або ключі.&lt;br /&gt;&lt;br /&gt;Перед початком роботи знайдіть ваш файл <![CDATA[<b> prod.keys </b>]]> у сховищі пристрою.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Дізнатися більше</a>]]></string> 4 <string name="app_disclaimer">Це програмне забезпечення дозволяє запускати ігри для ігрової консолі Nintendo Switch. Ми не надаємо самі ігри або ключі.&lt;br /&gt;&lt;br /&gt;Перед початком роботи знайдіть ваш файл <![CDATA[<b> prod.keys </b>]]> у сховищі пристрою.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Дізнатися більше</a>]]></string>
5 <string name="emulation_notification_channel_name">Емуляція активна</string>
6 <string name="emulation_notification_channel_description">Показує постійне сповіщення, коли запущено емуляцію.</string>
7 <string name="emulation_notification_running">yuzu запущено</string>
8 <string name="notice_notification_channel_name">Сповіщення та помилки</string> 5 <string name="notice_notification_channel_name">Сповіщення та помилки</string>
9 <string name="notice_notification_channel_description">Показувати сповіщення, коли щось пішло не так</string> 6 <string name="notice_notification_channel_description">Показувати сповіщення, коли щось пішло не так</string>
10 <string name="notification_permission_not_granted">Ви не надали дозвіл сповіщень!</string> 7 <string name="notification_permission_not_granted">Ви не надали дозвіл сповіщень!</string>
diff --git a/src/android/app/src/main/res/values-vi/strings.xml b/src/android/app/src/main/res/values-vi/strings.xml
index 0a722f329..dc06610c7 100755
--- a/src/android/app/src/main/res/values-vi/strings.xml
+++ b/src/android/app/src/main/res/values-vi/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">Phần mềm này sẽ chạy các game cho máy chơi game Nintendo Switch. Không có title games hoặc keys được bao gồm.&lt;br /&gt;&lt;br /&gt;Trước khi bạn bắt đầu, hãy tìm tập tin <![CDATA[<b> prod.keys </b>]]> trên bộ nhớ thiết bị của bạn.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Tìm hiểu thêm</a>]]></string> 4 <string name="app_disclaimer">Phần mềm này sẽ chạy các game cho máy chơi game Nintendo Switch. Không có title games hoặc keys được bao gồm.&lt;br /&gt;&lt;br /&gt;Trước khi bạn bắt đầu, hãy tìm tập tin <![CDATA[<b> prod.keys </b>]]> trên bộ nhớ thiết bị của bạn.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Tìm hiểu thêm</a>]]></string>
5 <string name="emulation_notification_channel_name">Giả lập đang chạy</string>
6 <string name="emulation_notification_channel_description">Hiển thị thông báo liên tục khi giả lập đang chạy.</string>
7 <string name="emulation_notification_running">yuzu đang chạy</string>
8 <string name="notice_notification_channel_name">Thông báo và lỗi</string> 5 <string name="notice_notification_channel_name">Thông báo và lỗi</string>
9 <string name="notice_notification_channel_description">Hiển thị thông báo khi có sự cố xảy ra.</string> 6 <string name="notice_notification_channel_description">Hiển thị thông báo khi có sự cố xảy ra.</string>
10 <string name="notification_permission_not_granted">Ứng dụng không được cấp quyền thông báo!</string> 7 <string name="notification_permission_not_granted">Ứng dụng không được cấp quyền thông báo!</string>
diff --git a/src/android/app/src/main/res/values-zh-rCN/strings.xml b/src/android/app/src/main/res/values-zh-rCN/strings.xml
index b840591a4..6acf6f391 100755
--- a/src/android/app/src/main/res/values-zh-rCN/strings.xml
+++ b/src/android/app/src/main/res/values-zh-rCN/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">此软件可以运行 Nintendo Switch 游戏,但不包含任何游戏和密钥文件。&lt;br /&gt;&lt;br /&gt;在开始前,请找到放置于设备存储中的 <![CDATA[<b> prod.keys </b>]]> 文件。&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">了解更多</a>]]></string> 4 <string name="app_disclaimer">此软件可以运行 Nintendo Switch 游戏,但不包含任何游戏和密钥文件。&lt;br /&gt;&lt;br /&gt;在开始前,请找到放置于设备存储中的 <![CDATA[<b> prod.keys </b>]]> 文件。&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">了解更多</a>]]></string>
5 <string name="emulation_notification_channel_name">正在进行模拟</string>
6 <string name="emulation_notification_channel_description">在模拟运行时显示持久通知。</string>
7 <string name="emulation_notification_running">yuzu 正在运行</string>
8 <string name="notice_notification_channel_name">通知及错误提醒</string> 5 <string name="notice_notification_channel_name">通知及错误提醒</string>
9 <string name="notice_notification_channel_description">当发生错误时显示通知。</string> 6 <string name="notice_notification_channel_description">当发生错误时显示通知。</string>
10 <string name="notification_permission_not_granted">未授予通知权限!</string> 7 <string name="notification_permission_not_granted">未授予通知权限!</string>
diff --git a/src/android/app/src/main/res/values-zh-rTW/strings.xml b/src/android/app/src/main/res/values-zh-rTW/strings.xml
index d39255714..411fc5947 100755
--- a/src/android/app/src/main/res/values-zh-rTW/strings.xml
+++ b/src/android/app/src/main/res/values-zh-rTW/strings.xml
@@ -2,9 +2,6 @@
2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> 2<resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation">
3 3
4 <string name="app_disclaimer">此軟體可以執行 Nintendo Switch 主機遊戲,但不包含任何遊戲和金鑰。&lt;br /&gt;&lt;br /&gt;在您開始前,請找到放置於您的裝置儲存空間的 <![CDATA[<b> prod.keys </b>]]> 檔案。&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">深入瞭解</a>]]></string> 4 <string name="app_disclaimer">此軟體可以執行 Nintendo Switch 主機遊戲,但不包含任何遊戲和金鑰。&lt;br /&gt;&lt;br /&gt;在您開始前,請找到放置於您的裝置儲存空間的 <![CDATA[<b> prod.keys </b>]]> 檔案。&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">深入瞭解</a>]]></string>
5 <string name="emulation_notification_channel_name">模擬進行中</string>
6 <string name="emulation_notification_channel_description">在模擬執行時顯示持續通知。</string>
7 <string name="emulation_notification_running">yuzu 正在執行</string>
8 <string name="notice_notification_channel_name">通知和錯誤</string> 5 <string name="notice_notification_channel_name">通知和錯誤</string>
9 <string name="notice_notification_channel_description">發生錯誤時顯示通知。</string> 6 <string name="notice_notification_channel_description">發生錯誤時顯示通知。</string>
10 <string name="notification_permission_not_granted">未授予通知權限!</string> 7 <string name="notification_permission_not_granted">未授予通知權限!</string>
diff --git a/src/android/app/src/main/res/values/strings.xml b/src/android/app/src/main/res/values/strings.xml
index 3cd1586fd..489e00107 100755
--- a/src/android/app/src/main/res/values/strings.xml
+++ b/src/android/app/src/main/res/values/strings.xml
@@ -4,10 +4,6 @@
4 <!-- General application strings --> 4 <!-- General application strings -->
5 <string name="app_name" translatable="false">yuzu</string> 5 <string name="app_name" translatable="false">yuzu</string>
6 <string name="app_disclaimer">This software will run games for the Nintendo Switch game console. No game titles or keys are included.&lt;br /&gt;&lt;br /&gt;Before you begin, please locate your <![CDATA[<b> prod.keys </b>]]> file on your device storage.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href="https://yuzu-emu.org/help/quickstart">Learn more</a>]]></string> 6 <string name="app_disclaimer">This software will run games for the Nintendo Switch game console. No game titles or keys are included.&lt;br /&gt;&lt;br /&gt;Before you begin, please locate your <![CDATA[<b> prod.keys </b>]]> file on your device storage.&lt;br /&gt;&lt;br /&gt;<![CDATA[<a href="https://yuzu-emu.org/help/quickstart">Learn more</a>]]></string>
7 <string name="emulation_notification_channel_name">Emulation is Active</string>
8 <string name="emulation_notification_channel_id" translatable="false">emulationIsActive</string>
9 <string name="emulation_notification_channel_description">Shows a persistent notification when emulation is running.</string>
10 <string name="emulation_notification_running">yuzu is running</string>
11 <string name="notice_notification_channel_name">Notices and errors</string> 7 <string name="notice_notification_channel_name">Notices and errors</string>
12 <string name="notice_notification_channel_id" translatable="false">noticesAndErrors</string> 8 <string name="notice_notification_channel_id" translatable="false">noticesAndErrors</string>
13 <string name="notice_notification_channel_description">Shows notifications when something goes wrong.</string> 9 <string name="notice_notification_channel_description">Shows notifications when something goes wrong.</string>
@@ -380,6 +376,7 @@
380 <string name="emulation_exit">Exit emulation</string> 376 <string name="emulation_exit">Exit emulation</string>
381 <string name="emulation_done">Done</string> 377 <string name="emulation_done">Done</string>
382 <string name="emulation_fps_counter">FPS counter</string> 378 <string name="emulation_fps_counter">FPS counter</string>
379 <string name="emulation_thermal_indicator">Thermal indicator</string>
383 <string name="emulation_toggle_controls">Toggle controls</string> 380 <string name="emulation_toggle_controls">Toggle controls</string>
384 <string name="emulation_rel_stick_center">Relative stick center</string> 381 <string name="emulation_rel_stick_center">Relative stick center</string>
385 <string name="emulation_dpad_slide">D-pad slide</string> 382 <string name="emulation_dpad_slide">D-pad slide</string>
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 429dd3e26..1cede53b6 100755
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -182,9 +182,15 @@ endif()
182 182
183if(ANDROID) 183if(ANDROID)
184 target_sources(common 184 target_sources(common
185 PRIVATE 185 PUBLIC
186 fs/fs_android.cpp 186 fs/fs_android.cpp
187 fs/fs_android.h 187 fs/fs_android.h
188 android/android_common.cpp
189 android/android_common.h
190 android/id_cache.cpp
191 android/id_cache.h
192 android/applets/software_keyboard.cpp
193 android/applets/software_keyboard.h
188 ) 194 )
189endif() 195endif()
190 196
diff --git a/src/common/android/android_common.cpp b/src/common/android/android_common.cpp
new file mode 100755
index 000000000..e79005658
--- /dev/null
+++ b/src/common/android/android_common.cpp
@@ -0,0 +1,65 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4#include "android_common.h"
5
6#include <string>
7#include <string_view>
8
9#include <jni.h>
10
11#include "common/android/id_cache.h"
12#include "common/string_util.h"
13
14namespace Common::Android {
15
16std::string GetJString(JNIEnv* env, jstring jstr) {
17 if (!jstr) {
18 return {};
19 }
20
21 const jchar* jchars = env->GetStringChars(jstr, nullptr);
22 const jsize length = env->GetStringLength(jstr);
23 const std::u16string_view string_view(reinterpret_cast<const char16_t*>(jchars),
24 static_cast<u32>(length));
25 const std::string converted_string = Common::UTF16ToUTF8(string_view);
26 env->ReleaseStringChars(jstr, jchars);
27
28 return converted_string;
29}
30
31jstring ToJString(JNIEnv* env, std::string_view str) {
32 const std::u16string converted_string = Common::UTF8ToUTF16(str);
33 return env->NewString(reinterpret_cast<const jchar*>(converted_string.data()),
34 static_cast<jint>(converted_string.size()));
35}
36
37jstring ToJString(JNIEnv* env, std::u16string_view str) {
38 return ToJString(env, Common::UTF16ToUTF8(str));
39}
40
41double GetJDouble(JNIEnv* env, jobject jdouble) {
42 return env->GetDoubleField(jdouble, GetDoubleValueField());
43}
44
45jobject ToJDouble(JNIEnv* env, double value) {
46 return env->NewObject(GetDoubleClass(), GetDoubleConstructor(), value);
47}
48
49s32 GetJInteger(JNIEnv* env, jobject jinteger) {
50 return env->GetIntField(jinteger, GetIntegerValueField());
51}
52
53jobject ToJInteger(JNIEnv* env, s32 value) {
54 return env->NewObject(GetIntegerClass(), GetIntegerConstructor(), value);
55}
56
57bool GetJBoolean(JNIEnv* env, jobject jboolean) {
58 return env->GetBooleanField(jboolean, GetBooleanValueField());
59}
60
61jobject ToJBoolean(JNIEnv* env, bool value) {
62 return env->NewObject(GetBooleanClass(), GetBooleanConstructor(), value);
63}
64
65} // namespace Common::Android
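android_common.cpp centralizes the string and boxed-primitive conversions used throughout the JNI layer. GetJString copies the Java string's UTF-16 code units (GetStringChars/ReleaseStringChars) and converts them with Common::UTF16ToUTF8; ToJString builds the Java string back from UTF-16 via NewString, so non-ASCII text round-trips intact. A minimal usage sketch under those assumptions (the entry-point name is hypothetical):

    // Hypothetical JNI entry point: echo a Java string back, round-tripping UTF-16 <-> UTF-8.
    extern "C" jstring Java_org_yuzu_yuzu_1emu_utils_Example_echo(JNIEnv* env, jobject, jstring jinput) {
        const std::string utf8 = Common::Android::GetJString(env, jinput); // UTF-16 -> UTF-8
        return Common::Android::ToJString(env, utf8);                      // UTF-8 -> UTF-16
    }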
diff --git a/src/common/android/android_common.h b/src/common/android/android_common.h
new file mode 100755
index 000000000..d0ccb4ec2
--- /dev/null
+++ b/src/common/android/android_common.h
@@ -0,0 +1,26 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4#pragma once
5
6#include <string>
7
8#include <jni.h>
9#include "common/common_types.h"
10
11namespace Common::Android {
12
13std::string GetJString(JNIEnv* env, jstring jstr);
14jstring ToJString(JNIEnv* env, std::string_view str);
15jstring ToJString(JNIEnv* env, std::u16string_view str);
16
17double GetJDouble(JNIEnv* env, jobject jdouble);
18jobject ToJDouble(JNIEnv* env, double value);
19
20s32 GetJInteger(JNIEnv* env, jobject jinteger);
21jobject ToJInteger(JNIEnv* env, s32 value);
22
23bool GetJBoolean(JNIEnv* env, jobject jboolean);
24jobject ToJBoolean(JNIEnv* env, bool value);
25
26} // namespace Common::Android
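The Get*/To* pairs declared here box and unbox java.lang.Double, Integer, and Boolean through constructor and value-field IDs cached in common/android/id_cache.h; that cache is what the Pair unpacking in native_config.cpp relies on. A short round-trip sketch, assuming a valid JNIEnv* env inside a JNI call and an initialized ID cache:

    // Box a double into java.lang.Double and read it back out.
    jobject boxed = Common::Android::ToJDouble(env, 0.5); // NewObject returns a local reference
    const double value = Common::Android::GetJDouble(env, boxed);
    env->DeleteLocalRef(boxed); // release the local ref promptly in long-running native code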
diff --git a/src/common/android/applets/software_keyboard.cpp b/src/common/android/applets/software_keyboard.cpp
new file mode 100755
index 000000000..477e62b16
--- /dev/null
+++ b/src/common/android/applets/software_keyboard.cpp
@@ -0,0 +1,277 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4#include <map>
5#include <thread>
6
7#include <jni.h>
8
9#include "common/android/android_common.h"
10#include "common/android/applets/software_keyboard.h"
11#include "common/android/id_cache.h"
12#include "common/logging/log.h"
13#include "common/string_util.h"
14#include "core/core.h"
15
16static jclass s_software_keyboard_class;
17static jclass s_keyboard_config_class;
18static jclass s_keyboard_data_class;
19static jmethodID s_swkbd_execute_normal;
20static jmethodID s_swkbd_execute_inline;
21
22namespace Common::Android::SoftwareKeyboard {
23
24static jobject ToJKeyboardParams(const Core::Frontend::KeyboardInitializeParameters& config) {
25 JNIEnv* env = GetEnvForThread();
26 jobject object = env->AllocObject(s_keyboard_config_class);
27
28 env->SetObjectField(object,
29 env->GetFieldID(s_keyboard_config_class, "ok_text", "Ljava/lang/String;"),
30 ToJString(env, config.ok_text));
31 env->SetObjectField(
32 object, env->GetFieldID(s_keyboard_config_class, "header_text", "Ljava/lang/String;"),
33 ToJString(env, config.header_text));
34 env->SetObjectField(object,
35 env->GetFieldID(s_keyboard_config_class, "sub_text", "Ljava/lang/String;"),
36 ToJString(env, config.sub_text));
37 env->SetObjectField(
38 object, env->GetFieldID(s_keyboard_config_class, "guide_text", "Ljava/lang/String;"),
39 ToJString(env, config.guide_text));
40 env->SetObjectField(
41 object, env->GetFieldID(s_keyboard_config_class, "initial_text", "Ljava/lang/String;"),
42 ToJString(env, config.initial_text));
43 env->SetShortField(object,
44 env->GetFieldID(s_keyboard_config_class, "left_optional_symbol_key", "S"),
45 static_cast<jshort>(config.left_optional_symbol_key));
46 env->SetShortField(object,
47 env->GetFieldID(s_keyboard_config_class, "right_optional_symbol_key", "S"),
48 static_cast<jshort>(config.right_optional_symbol_key));
49 env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "max_text_length", "I"),
50 static_cast<jint>(config.max_text_length));
51 env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "min_text_length", "I"),
52 static_cast<jint>(config.min_text_length));
53 env->SetIntField(object,
54 env->GetFieldID(s_keyboard_config_class, "initial_cursor_position", "I"),
55 static_cast<jint>(config.initial_cursor_position));
56 env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "type", "I"),
57 static_cast<jint>(config.type));
58 env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "password_mode", "I"),
59 static_cast<jint>(config.password_mode));
60 env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "text_draw_type", "I"),
61 static_cast<jint>(config.text_draw_type));
62 env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "key_disable_flags", "I"),
63 static_cast<jint>(config.key_disable_flags.raw));
64 env->SetBooleanField(object,
65 env->GetFieldID(s_keyboard_config_class, "use_blur_background", "Z"),
66 static_cast<jboolean>(config.use_blur_background));
67 env->SetBooleanField(object,
68 env->GetFieldID(s_keyboard_config_class, "enable_backspace_button", "Z"),
69 static_cast<jboolean>(config.enable_backspace_button));
70 env->SetBooleanField(object,
71 env->GetFieldID(s_keyboard_config_class, "enable_return_button", "Z"),
72 static_cast<jboolean>(config.enable_return_button));
73 env->SetBooleanField(object,
74 env->GetFieldID(s_keyboard_config_class, "disable_cancel_button", "Z"),
75 static_cast<jboolean>(config.disable_cancel_button));
76
77 return object;
78}
79
80AndroidKeyboard::ResultData AndroidKeyboard::ResultData::CreateFromFrontend(jobject object) {
81 JNIEnv* env = GetEnvForThread();
82 const jstring string = reinterpret_cast<jstring>(env->GetObjectField(
83 object, env->GetFieldID(s_keyboard_data_class, "text", "Ljava/lang/String;")));
84 return ResultData{GetJString(env, string),
85 static_cast<Service::AM::Frontend::SwkbdResult>(env->GetIntField(
86 object, env->GetFieldID(s_keyboard_data_class, "result", "I")))};
87}
88
89AndroidKeyboard::~AndroidKeyboard() = default;
90
91void AndroidKeyboard::InitializeKeyboard(
92 bool is_inline, Core::Frontend::KeyboardInitializeParameters initialize_parameters,
93 SubmitNormalCallback submit_normal_callback_, SubmitInlineCallback submit_inline_callback_) {
94 if (is_inline) {
95 LOG_WARNING(
96 Frontend,
97 "(STUBBED) called, backend requested to initialize the inline software keyboard.");
98
99 submit_inline_callback = std::move(submit_inline_callback_);
100 } else {
101 LOG_WARNING(
102 Frontend,
103 "(STUBBED) called, backend requested to initialize the normal software keyboard.");
104
105 submit_normal_callback = std::move(submit_normal_callback_);
106 }
107
108 parameters = std::move(initialize_parameters);
109
110 LOG_INFO(Frontend,
111 "\nKeyboardInitializeParameters:"
112 "\nok_text={}"
113 "\nheader_text={}"
114 "\nsub_text={}"
115 "\nguide_text={}"
116 "\ninitial_text={}"
117 "\nmax_text_length={}"
118 "\nmin_text_length={}"
119 "\ninitial_cursor_position={}"
120 "\ntype={}"
121 "\npassword_mode={}"
122 "\ntext_draw_type={}"
123 "\nkey_disable_flags={}"
124 "\nuse_blur_background={}"
125 "\nenable_backspace_button={}"
126 "\nenable_return_button={}"
127 "\ndisable_cancel_button={}",
128 Common::UTF16ToUTF8(parameters.ok_text), Common::UTF16ToUTF8(parameters.header_text),
129 Common::UTF16ToUTF8(parameters.sub_text), Common::UTF16ToUTF8(parameters.guide_text),
130 Common::UTF16ToUTF8(parameters.initial_text), parameters.max_text_length,
131 parameters.min_text_length, parameters.initial_cursor_position, parameters.type,
132 parameters.password_mode, parameters.text_draw_type, parameters.key_disable_flags.raw,
133 parameters.use_blur_background, parameters.enable_backspace_button,
134 parameters.enable_return_button, parameters.disable_cancel_button);
135}
136
137void AndroidKeyboard::ShowNormalKeyboard() const {
138 LOG_DEBUG(Frontend, "called, backend requested to show the normal software keyboard.");
139
140 ResultData data{};
141
142 // Pivot to a new thread, as we cannot call GetEnvForThread() from a Fiber.
143 std::thread([&] {
144 data = ResultData::CreateFromFrontend(GetEnvForThread()->CallStaticObjectMethod(
145 s_software_keyboard_class, s_swkbd_execute_normal, ToJKeyboardParams(parameters)));
146 }).join();
147
148 SubmitNormalText(data);
149}
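The pivot above exists because the backend invokes this applet from guest code running on a fiber, and, per the comment, GetEnvForThread() cannot be called there, presumably since the JNIEnv is cached per OS thread while a fiber has no stable thread identity. A minimal sketch of the same pattern, with a hypothetical helper name:

    #include <thread>

    template <typename T, typename Func>
    T RunOnNewThread(Func&& work) {
        // The JNI work runs on a real OS thread; join() keeps the call
        // synchronous from the calling fiber's point of view.
        T result{};
        std::thread([&] { result = work(); }).join();
        return result;
    }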
150
151void AndroidKeyboard::ShowTextCheckDialog(
152 Service::AM::Frontend::SwkbdTextCheckResult text_check_result,
153 std::u16string text_check_message) const {
154 LOG_WARNING(Frontend, "(STUBBED) called, backend requested to show the text check dialog.");
155}
156
157void AndroidKeyboard::ShowInlineKeyboard(
158 Core::Frontend::InlineAppearParameters appear_parameters) const {
159 LOG_WARNING(Frontend,
160 "(STUBBED) called, backend requested to show the inline software keyboard.");
161
162 LOG_INFO(Frontend,
163 "\nInlineAppearParameters:"
164 "\nmax_text_length={}"
165 "\nmin_text_length={}"
166 "\nkey_top_scale_x={}"
167 "\nkey_top_scale_y={}"
168 "\nkey_top_translate_x={}"
169 "\nkey_top_translate_y={}"
170 "\ntype={}"
171 "\nkey_disable_flags={}"
172 "\nkey_top_as_floating={}"
173 "\nenable_backspace_button={}"
174 "\nenable_return_button={}"
175 "\ndisable_cancel_button={}",
176 appear_parameters.max_text_length, appear_parameters.min_text_length,
177 appear_parameters.key_top_scale_x, appear_parameters.key_top_scale_y,
178 appear_parameters.key_top_translate_x, appear_parameters.key_top_translate_y,
179 appear_parameters.type, appear_parameters.key_disable_flags.raw,
180 appear_parameters.key_top_as_floating, appear_parameters.enable_backspace_button,
181 appear_parameters.enable_return_button, appear_parameters.disable_cancel_button);
182
183 // Pivot to a new thread, as we cannot call GetEnvForThread() from a Fiber.
184 m_is_inline_active = true;
185 std::thread([&] {
186 GetEnvForThread()->CallStaticVoidMethod(s_software_keyboard_class, s_swkbd_execute_inline,
187 ToJKeyboardParams(parameters));
188 }).join();
189}
190
191void AndroidKeyboard::HideInlineKeyboard() const {
192 LOG_WARNING(Frontend,
193 "(STUBBED) called, backend requested to hide the inline software keyboard.");
194}
195
196void AndroidKeyboard::InlineTextChanged(
197 Core::Frontend::InlineTextParameters text_parameters) const {
198 LOG_WARNING(Frontend,
199 "(STUBBED) called, backend requested to change the inline keyboard text.");
200
201 LOG_INFO(Frontend,
202 "\nInlineTextParameters:"
203 "\ninput_text={}"
204 "\ncursor_position={}",
205 Common::UTF16ToUTF8(text_parameters.input_text), text_parameters.cursor_position);
206
207 submit_inline_callback(Service::AM::Frontend::SwkbdReplyType::ChangedString,
208 text_parameters.input_text, text_parameters.cursor_position);
209}
210
211void AndroidKeyboard::ExitKeyboard() const {
212 LOG_WARNING(Frontend, "(STUBBED) called, backend requested to exit the software keyboard.");
213}
214
215void AndroidKeyboard::SubmitInlineKeyboardText(std::u16string submitted_text) {
216 if (!m_is_inline_active) {
217 return;
218 }
219
220 m_current_text += submitted_text;
221
222 submit_inline_callback(Service::AM::Frontend::SwkbdReplyType::ChangedString, m_current_text,
223 static_cast<int>(m_current_text.size()));
224}
225
226void AndroidKeyboard::SubmitInlineKeyboardInput(int key_code) {
227 static constexpr int KEYCODE_BACK = 4;
228 static constexpr int KEYCODE_ENTER = 66;
229 static constexpr int KEYCODE_DEL = 67;
230
231 if (!m_is_inline_active) {
232 return;
233 }
234
235 switch (key_code) {
236 case KEYCODE_BACK:
237 case KEYCODE_ENTER:
238 m_is_inline_active = false;
239 submit_inline_callback(Service::AM::Frontend::SwkbdReplyType::DecidedEnter, m_current_text,
240 static_cast<s32>(m_current_text.size()));
241 break;
242 case KEYCODE_DEL:
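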
243        if (!m_current_text.empty()) { m_current_text.pop_back(); } // don't pop an empty string
244 submit_inline_callback(Service::AM::Frontend::SwkbdReplyType::ChangedString, m_current_text,
245 static_cast<int>(m_current_text.size()));
246 break;
247 }
248}
249
250void AndroidKeyboard::SubmitNormalText(const ResultData& data) const {
251 submit_normal_callback(data.result, Common::UTF8ToUTF16(data.text), true);
252}
253
254void InitJNI(JNIEnv* env) {
255 s_software_keyboard_class = reinterpret_cast<jclass>(
256 env->NewGlobalRef(env->FindClass("org/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard")));
257 s_keyboard_config_class = reinterpret_cast<jclass>(env->NewGlobalRef(
258 env->FindClass("org/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard$KeyboardConfig")));
259 s_keyboard_data_class = reinterpret_cast<jclass>(env->NewGlobalRef(
260 env->FindClass("org/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard$KeyboardData")));
261
262 s_swkbd_execute_normal = env->GetStaticMethodID(
263 s_software_keyboard_class, "executeNormal",
264 "(Lorg/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard$KeyboardConfig;)Lorg/yuzu/yuzu_emu/"
265 "applets/keyboard/SoftwareKeyboard$KeyboardData;");
266 s_swkbd_execute_inline = env->GetStaticMethodID(
267 s_software_keyboard_class, "executeInline",
268 "(Lorg/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard$KeyboardConfig;)V");
269}
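The signature strings passed to GetStaticMethodID above follow the standard JNI descriptor grammar: "(" argument types ")" return type, where "Lpkg/Class;" names an object type, "Z" is boolean, "I" is int, "V" is void, and "$" separates a nested class from its outer class. A small self-contained illustration (example method chosen arbitrarily):

    #include <jni.h>

    jmethodID LookupParseInt(JNIEnv* env) {
        // static int Integer.parseInt(String) has descriptor "(Ljava/lang/String;)I"
        const jclass integer_class = env->FindClass("java/lang/Integer");
        return env->GetStaticMethodID(integer_class, "parseInt", "(Ljava/lang/String;)I");
    }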
270
271void CleanupJNI(JNIEnv* env) {
272 env->DeleteGlobalRef(s_software_keyboard_class);
273 env->DeleteGlobalRef(s_keyboard_config_class);
274 env->DeleteGlobalRef(s_keyboard_data_class);
275}
276
277} // namespace Common::Android::SoftwareKeyboard
diff --git a/src/common/android/applets/software_keyboard.h b/src/common/android/applets/software_keyboard.h
new file mode 100755
index 000000000..9fd09d27c
--- /dev/null
+++ b/src/common/android/applets/software_keyboard.h
@@ -0,0 +1,78 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4#pragma once
5
6#include <jni.h>
7
8#include "core/frontend/applets/software_keyboard.h"
9
10namespace Common::Android::SoftwareKeyboard {
11
12class AndroidKeyboard final : public Core::Frontend::SoftwareKeyboardApplet {
13public:
14 ~AndroidKeyboard() override;
15
16 void Close() const override {
17 ExitKeyboard();
18 }
19
20 void InitializeKeyboard(bool is_inline,
21 Core::Frontend::KeyboardInitializeParameters initialize_parameters,
22 SubmitNormalCallback submit_normal_callback_,
23 SubmitInlineCallback submit_inline_callback_) override;
24
25 void ShowNormalKeyboard() const override;
26
27 void ShowTextCheckDialog(Service::AM::Frontend::SwkbdTextCheckResult text_check_result,
28 std::u16string text_check_message) const override;
29
30 void ShowInlineKeyboard(
31 Core::Frontend::InlineAppearParameters appear_parameters) const override;
32
33 void HideInlineKeyboard() const override;
34
35 void InlineTextChanged(Core::Frontend::InlineTextParameters text_parameters) const override;
36
37 void ExitKeyboard() const override;
38
39 void SubmitInlineKeyboardText(std::u16string submitted_text);
40
41 void SubmitInlineKeyboardInput(int key_code);
42
43private:
44 struct ResultData {
45 static ResultData CreateFromFrontend(jobject object);
46
47 std::string text;
48 Service::AM::Frontend::SwkbdResult result{};
49 };
50
51 void SubmitNormalText(const ResultData& result) const;
52
53 Core::Frontend::KeyboardInitializeParameters parameters{};
54
55 mutable SubmitNormalCallback submit_normal_callback;
56 mutable SubmitInlineCallback submit_inline_callback;
57
58private:
59 mutable bool m_is_inline_active{};
60 std::u16string m_current_text;
61};
62
63// Should be called in JNI_OnLoad
64void InitJNI(JNIEnv* env);
65
66// Should be called in JNI_OnUnload
67void CleanupJNI(JNIEnv* env);
68
69} // namespace Common::Android::SoftwareKeyboard
70
71// Native function calls
72extern "C" {
73JNIEXPORT jobject JNICALL Java_org_yuzu_yuzu_1emu_applets_keyboard_SoftwareKeyboard_ValidateFilters(
74    JNIEnv* env, jclass clazz, jstring text);
75
76JNIEXPORT jobject JNICALL Java_org_yuzu_yuzu_1emu_applets_keyboard_SoftwareKeyboard_ValidateInput(
77    JNIEnv* env, jclass clazz, jstring text);
78}
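For orientation, a hedged sketch of how a frontend might drive this applet; the callback signature mirrors the SubmitNormalText() call in the .cpp above, the surrounding wiring is hypothetical, and the lambda is assumed to be convertible to SubmitNormalCallback:

    #include <memory>
    #include <utility>

    #include "common/android/applets/software_keyboard.h"

    void ShowKeyboardSketch(Core::Frontend::KeyboardInitializeParameters params) {
        auto keyboard = std::make_unique<Common::Android::SoftwareKeyboard::AndroidKeyboard>();
        keyboard->InitializeKeyboard(
            false, std::move(params),
            [](Service::AM::Frontend::SwkbdResult result, std::u16string text, bool confirmed) {
                // The text submitted in the Java-side dialog arrives here.
            },
            {});
        keyboard->ShowNormalKeyboard(); // blocks until ResultData comes back
    }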
diff --git a/src/common/android/id_cache.cpp b/src/common/android/id_cache.cpp
new file mode 100755
index 000000000..f39262db9
--- /dev/null
+++ b/src/common/android/id_cache.cpp
@@ -0,0 +1,428 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4#include <jni.h>
5
6#include "applets/software_keyboard.h"
7#include "common/android/id_cache.h"
8#include "common/assert.h"
9#include "common/fs/fs_android.h"
10#include "video_core/rasterizer_interface.h"
11
12static JavaVM* s_java_vm;
13static jclass s_native_library_class;
14static jclass s_disk_cache_progress_class;
15static jclass s_load_callback_stage_class;
16static jclass s_game_dir_class;
17static jmethodID s_game_dir_constructor;
18static jmethodID s_exit_emulation_activity;
19static jmethodID s_disk_cache_load_progress;
20static jmethodID s_on_emulation_started;
21static jmethodID s_on_emulation_stopped;
22static jmethodID s_on_program_changed;
23
24static jclass s_game_class;
25static jmethodID s_game_constructor;
26static jfieldID s_game_title_field;
27static jfieldID s_game_path_field;
28static jfieldID s_game_program_id_field;
29static jfieldID s_game_developer_field;
30static jfieldID s_game_version_field;
31static jfieldID s_game_is_homebrew_field;
32
33static jclass s_string_class;
34static jclass s_pair_class;
35static jmethodID s_pair_constructor;
36static jfieldID s_pair_first_field;
37static jfieldID s_pair_second_field;
38
39static jclass s_overlay_control_data_class;
40static jmethodID s_overlay_control_data_constructor;
41static jfieldID s_overlay_control_data_id_field;
42static jfieldID s_overlay_control_data_enabled_field;
43static jfieldID s_overlay_control_data_landscape_position_field;
44static jfieldID s_overlay_control_data_portrait_position_field;
45static jfieldID s_overlay_control_data_foldable_position_field;
46
47static jclass s_patch_class;
48static jmethodID s_patch_constructor;
49static jfieldID s_patch_enabled_field;
50static jfieldID s_patch_name_field;
51static jfieldID s_patch_version_field;
52static jfieldID s_patch_type_field;
53static jfieldID s_patch_program_id_field;
54static jfieldID s_patch_title_id_field;
55
56static jclass s_double_class;
57static jmethodID s_double_constructor;
58static jfieldID s_double_value_field;
59
60static jclass s_integer_class;
61static jmethodID s_integer_constructor;
62static jfieldID s_integer_value_field;
63
64static jclass s_boolean_class;
65static jmethodID s_boolean_constructor;
66static jfieldID s_boolean_value_field;
67
68static constexpr jint JNI_VERSION = JNI_VERSION_1_6;
69
70namespace Common::Android {
71
72JNIEnv* GetEnvForThread() {
73 thread_local static struct OwnedEnv {
74 OwnedEnv() {
75 status = s_java_vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6);
76 if (status == JNI_EDETACHED)
77 s_java_vm->AttachCurrentThread(&env, nullptr);
78 }
79
80 ~OwnedEnv() {
81 if (status == JNI_EDETACHED)
82 s_java_vm->DetachCurrentThread();
83 }
84
85 int status;
86 JNIEnv* env = nullptr;
87 } owned;
88 return owned.env;
89}
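A short illustration of what this buys callers (the function below is hypothetical): any native thread may ask for an env, the first call attaches the thread, and the thread_local destructor detaches it at thread exit:

    #include <thread>

    void NotifyEmulationStartedFromWorker() {
        std::thread([] {
            // First call on this thread attaches it to the JVM.
            JNIEnv* env = Common::Android::GetEnvForThread();
            env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(),
                                      Common::Android::GetOnEmulationStarted());
        }).join(); // ~OwnedEnv() runs at thread exit and detaches
    }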
90
91jclass GetNativeLibraryClass() {
92 return s_native_library_class;
93}
94
95jclass GetDiskCacheProgressClass() {
96 return s_disk_cache_progress_class;
97}
98
99jclass GetDiskCacheLoadCallbackStageClass() {
100 return s_load_callback_stage_class;
101}
102
103jclass GetGameDirClass() {
104 return s_game_dir_class;
105}
106
107jmethodID GetGameDirConstructor() {
108 return s_game_dir_constructor;
109}
110
111jmethodID GetExitEmulationActivity() {
112 return s_exit_emulation_activity;
113}
114
115jmethodID GetDiskCacheLoadProgress() {
116 return s_disk_cache_load_progress;
117}
118
119jmethodID GetOnEmulationStarted() {
120 return s_on_emulation_started;
121}
122
123jmethodID GetOnEmulationStopped() {
124 return s_on_emulation_stopped;
125}
126
127jmethodID GetOnProgramChanged() {
128 return s_on_program_changed;
129}
130
131jclass GetGameClass() {
132 return s_game_class;
133}
134
135jmethodID GetGameConstructor() {
136 return s_game_constructor;
137}
138
139jfieldID GetGameTitleField() {
140 return s_game_title_field;
141}
142
143jfieldID GetGamePathField() {
144 return s_game_path_field;
145}
146
147jfieldID GetGameProgramIdField() {
148 return s_game_program_id_field;
149}
150
151jfieldID GetGameDeveloperField() {
152 return s_game_developer_field;
153}
154
155jfieldID GetGameVersionField() {
156 return s_game_version_field;
157}
158
159jfieldID GetGameIsHomebrewField() {
160 return s_game_is_homebrew_field;
161}
162
163jclass GetStringClass() {
164 return s_string_class;
165}
166
167jclass GetPairClass() {
168 return s_pair_class;
169}
170
171jmethodID GetPairConstructor() {
172 return s_pair_constructor;
173}
174
175jfieldID GetPairFirstField() {
176 return s_pair_first_field;
177}
178
179jfieldID GetPairSecondField() {
180 return s_pair_second_field;
181}
182
183jclass GetOverlayControlDataClass() {
184 return s_overlay_control_data_class;
185}
186
187jmethodID GetOverlayControlDataConstructor() {
188 return s_overlay_control_data_constructor;
189}
190
191jfieldID GetOverlayControlDataIdField() {
192 return s_overlay_control_data_id_field;
193}
194
195jfieldID GetOverlayControlDataEnabledField() {
196 return s_overlay_control_data_enabled_field;
197}
198
199jfieldID GetOverlayControlDataLandscapePositionField() {
200 return s_overlay_control_data_landscape_position_field;
201}
202
203jfieldID GetOverlayControlDataPortraitPositionField() {
204 return s_overlay_control_data_portrait_position_field;
205}
206
207jfieldID GetOverlayControlDataFoldablePositionField() {
208 return s_overlay_control_data_foldable_position_field;
209}
210
211jclass GetPatchClass() {
212 return s_patch_class;
213}
214
215jmethodID GetPatchConstructor() {
216 return s_patch_constructor;
217}
218
219jfieldID GetPatchEnabledField() {
220 return s_patch_enabled_field;
221}
222
223jfieldID GetPatchNameField() {
224 return s_patch_name_field;
225}
226
227jfieldID GetPatchVersionField() {
228 return s_patch_version_field;
229}
230
231jfieldID GetPatchTypeField() {
232 return s_patch_type_field;
233}
234
235jfieldID GetPatchProgramIdField() {
236 return s_patch_program_id_field;
237}
238
239jfieldID GetPatchTitleIdField() {
240 return s_patch_title_id_field;
241}
242
243jclass GetDoubleClass() {
244 return s_double_class;
245}
246
247jmethodID GetDoubleConstructor() {
248 return s_double_constructor;
249}
250
251jfieldID GetDoubleValueField() {
252 return s_double_value_field;
253}
254
255jclass GetIntegerClass() {
256 return s_integer_class;
257}
258
259jmethodID GetIntegerConstructor() {
260 return s_integer_constructor;
261}
262
263jfieldID GetIntegerValueField() {
264 return s_integer_value_field;
265}
266
267jclass GetBooleanClass() {
268 return s_boolean_class;
269}
270
271jmethodID GetBooleanConstructor() {
272 return s_boolean_constructor;
273}
274
275jfieldID GetBooleanValueField() {
276 return s_boolean_value_field;
277}
278
279#ifdef __cplusplus
280extern "C" {
281#endif
282
283jint JNI_OnLoad(JavaVM* vm, void* reserved) {
284 s_java_vm = vm;
285
286 JNIEnv* env;
287 if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION) != JNI_OK)
288 return JNI_ERR;
289
290 // Initialize Java classes
291 const jclass native_library_class = env->FindClass("org/yuzu/yuzu_emu/NativeLibrary");
292 s_native_library_class = reinterpret_cast<jclass>(env->NewGlobalRef(native_library_class));
293 s_disk_cache_progress_class = reinterpret_cast<jclass>(env->NewGlobalRef(
294 env->FindClass("org/yuzu/yuzu_emu/disk_shader_cache/DiskShaderCacheProgress")));
295 s_load_callback_stage_class = reinterpret_cast<jclass>(env->NewGlobalRef(env->FindClass(
296 "org/yuzu/yuzu_emu/disk_shader_cache/DiskShaderCacheProgress$LoadCallbackStage")));
297
298 const jclass game_dir_class = env->FindClass("org/yuzu/yuzu_emu/model/GameDir");
299 s_game_dir_class = reinterpret_cast<jclass>(env->NewGlobalRef(game_dir_class));
300 s_game_dir_constructor = env->GetMethodID(game_dir_class, "<init>", "(Ljava/lang/String;Z)V");
301 env->DeleteLocalRef(game_dir_class);
302
303 // Initialize methods
304 s_exit_emulation_activity =
305 env->GetStaticMethodID(s_native_library_class, "exitEmulationActivity", "(I)V");
306 s_disk_cache_load_progress =
307 env->GetStaticMethodID(s_disk_cache_progress_class, "loadProgress", "(III)V");
308 s_on_emulation_started =
309 env->GetStaticMethodID(s_native_library_class, "onEmulationStarted", "()V");
310 s_on_emulation_stopped =
311 env->GetStaticMethodID(s_native_library_class, "onEmulationStopped", "(I)V");
312 s_on_program_changed =
313 env->GetStaticMethodID(s_native_library_class, "onProgramChanged", "(I)V");
314
315 const jclass game_class = env->FindClass("org/yuzu/yuzu_emu/model/Game");
316 s_game_class = reinterpret_cast<jclass>(env->NewGlobalRef(game_class));
317 s_game_constructor = env->GetMethodID(game_class, "<init>",
318 "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/"
319 "String;Ljava/lang/String;Ljava/lang/String;Z)V");
320 s_game_title_field = env->GetFieldID(game_class, "title", "Ljava/lang/String;");
321 s_game_path_field = env->GetFieldID(game_class, "path", "Ljava/lang/String;");
322 s_game_program_id_field = env->GetFieldID(game_class, "programId", "Ljava/lang/String;");
323 s_game_developer_field = env->GetFieldID(game_class, "developer", "Ljava/lang/String;");
324 s_game_version_field = env->GetFieldID(game_class, "version", "Ljava/lang/String;");
325 s_game_is_homebrew_field = env->GetFieldID(game_class, "isHomebrew", "Z");
326 env->DeleteLocalRef(game_class);
327
328 const jclass string_class = env->FindClass("java/lang/String");
329 s_string_class = reinterpret_cast<jclass>(env->NewGlobalRef(string_class));
330 env->DeleteLocalRef(string_class);
331
332 const jclass pair_class = env->FindClass("kotlin/Pair");
333 s_pair_class = reinterpret_cast<jclass>(env->NewGlobalRef(pair_class));
334 s_pair_constructor =
335 env->GetMethodID(pair_class, "<init>", "(Ljava/lang/Object;Ljava/lang/Object;)V");
336 s_pair_first_field = env->GetFieldID(pair_class, "first", "Ljava/lang/Object;");
337 s_pair_second_field = env->GetFieldID(pair_class, "second", "Ljava/lang/Object;");
338 env->DeleteLocalRef(pair_class);
339
340 const jclass overlay_control_data_class =
341 env->FindClass("org/yuzu/yuzu_emu/overlay/model/OverlayControlData");
342 s_overlay_control_data_class =
343 reinterpret_cast<jclass>(env->NewGlobalRef(overlay_control_data_class));
344 s_overlay_control_data_constructor =
345 env->GetMethodID(overlay_control_data_class, "<init>",
346 "(Ljava/lang/String;ZLkotlin/Pair;Lkotlin/Pair;Lkotlin/Pair;)V");
347 s_overlay_control_data_id_field =
348 env->GetFieldID(overlay_control_data_class, "id", "Ljava/lang/String;");
349 s_overlay_control_data_enabled_field =
350 env->GetFieldID(overlay_control_data_class, "enabled", "Z");
351 s_overlay_control_data_landscape_position_field =
352 env->GetFieldID(overlay_control_data_class, "landscapePosition", "Lkotlin/Pair;");
353 s_overlay_control_data_portrait_position_field =
354 env->GetFieldID(overlay_control_data_class, "portraitPosition", "Lkotlin/Pair;");
355 s_overlay_control_data_foldable_position_field =
356 env->GetFieldID(overlay_control_data_class, "foldablePosition", "Lkotlin/Pair;");
357 env->DeleteLocalRef(overlay_control_data_class);
358
359 const jclass patch_class = env->FindClass("org/yuzu/yuzu_emu/model/Patch");
360 s_patch_class = reinterpret_cast<jclass>(env->NewGlobalRef(patch_class));
361 s_patch_constructor = env->GetMethodID(
362 patch_class, "<init>",
363 "(ZLjava/lang/String;Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;)V");
364 s_patch_enabled_field = env->GetFieldID(patch_class, "enabled", "Z");
365 s_patch_name_field = env->GetFieldID(patch_class, "name", "Ljava/lang/String;");
366 s_patch_version_field = env->GetFieldID(patch_class, "version", "Ljava/lang/String;");
367 s_patch_type_field = env->GetFieldID(patch_class, "type", "I");
368 s_patch_program_id_field = env->GetFieldID(patch_class, "programId", "Ljava/lang/String;");
369 s_patch_title_id_field = env->GetFieldID(patch_class, "titleId", "Ljava/lang/String;");
370 env->DeleteLocalRef(patch_class);
371
372 const jclass double_class = env->FindClass("java/lang/Double");
373 s_double_class = reinterpret_cast<jclass>(env->NewGlobalRef(double_class));
374 s_double_constructor = env->GetMethodID(double_class, "<init>", "(D)V");
375 s_double_value_field = env->GetFieldID(double_class, "value", "D");
376 env->DeleteLocalRef(double_class);
377
378 const jclass int_class = env->FindClass("java/lang/Integer");
379 s_integer_class = reinterpret_cast<jclass>(env->NewGlobalRef(int_class));
380 s_integer_constructor = env->GetMethodID(int_class, "<init>", "(I)V");
381 s_integer_value_field = env->GetFieldID(int_class, "value", "I");
382 env->DeleteLocalRef(int_class);
383
384 const jclass boolean_class = env->FindClass("java/lang/Boolean");
385 s_boolean_class = reinterpret_cast<jclass>(env->NewGlobalRef(boolean_class));
386 s_boolean_constructor = env->GetMethodID(boolean_class, "<init>", "(Z)V");
387 s_boolean_value_field = env->GetFieldID(boolean_class, "value", "Z");
388 env->DeleteLocalRef(boolean_class);
389
390 // Initialize Android Storage
391 Common::FS::Android::RegisterCallbacks(env, s_native_library_class);
392
393 // Initialize applets
394 Common::Android::SoftwareKeyboard::InitJNI(env);
395
396 return JNI_VERSION;
397}
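The NewGlobalRef/DeleteLocalRef pairing used throughout JNI_OnLoad follows the basic JNI caching rule: a jclass returned by FindClass is a local reference that dies with the calling frame, so anything stored in a static must first be promoted. A minimal illustration (names hypothetical):

    static jclass s_cached_string_class;

    void CacheStringClass(JNIEnv* env) {
        const jclass local = env->FindClass("java/lang/String");
        s_cached_string_class = reinterpret_cast<jclass>(env->NewGlobalRef(local));
        env->DeleteLocalRef(local); // the local reference is no longer needed
    }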
398
399void JNI_OnUnload(JavaVM* vm, void* reserved) {
400 JNIEnv* env;
401 if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION) != JNI_OK) {
402 return;
403 }
404
405    // Uninitialize Android Storage
406 Common::FS::Android::UnRegisterCallbacks();
407 env->DeleteGlobalRef(s_native_library_class);
408 env->DeleteGlobalRef(s_disk_cache_progress_class);
409 env->DeleteGlobalRef(s_load_callback_stage_class);
410 env->DeleteGlobalRef(s_game_dir_class);
411 env->DeleteGlobalRef(s_game_class);
412 env->DeleteGlobalRef(s_string_class);
413 env->DeleteGlobalRef(s_pair_class);
414 env->DeleteGlobalRef(s_overlay_control_data_class);
415 env->DeleteGlobalRef(s_patch_class);
416 env->DeleteGlobalRef(s_double_class);
417 env->DeleteGlobalRef(s_integer_class);
418 env->DeleteGlobalRef(s_boolean_class);
419
420    // Uninitialize applets
421 SoftwareKeyboard::CleanupJNI(env);
422}
423
424#ifdef __cplusplus
425}
426#endif
427
428} // namespace Common::Android
diff --git a/src/common/android/id_cache.h b/src/common/android/id_cache.h
new file mode 100755
index 000000000..47802f96c
--- /dev/null
+++ b/src/common/android/id_cache.h
@@ -0,0 +1,88 @@
1// SPDX-FileCopyrightText: 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-3.0-or-later
3
4#pragma once
5
6#include <future>
7#include <jni.h>
8
9#include "video_core/rasterizer_interface.h"
10
11namespace Common::Android {
12
13JNIEnv* GetEnvForThread();
14
15/**
16 * Runs JNI code on a new thread. Intended for fibers, which cannot themselves attach to the JVM.
17 * @tparam T Typename of the return value of the work lambda
18 * @param work Lambda that runs JNI code. This function takes care of attaching the new thread to
19 * the JVM
20 * @return The result of the work lambda
21 */
22template <typename T = void>
23T RunJNIOnFiber(const std::function<T(JNIEnv*)>& work) {
24 std::future<T> j_result = std::async(std::launch::async, [&] {
25 auto env = GetEnvForThread();
26 return work(env);
27 });
28 return j_result.get();
29}
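A hedged usage sketch: j_game is hypothetical and must be a global reference, since the lambda executes on a different thread, and GetJString is assumed to come from common/android/android_common.h:

    std::string GetTitleFromFiber(jobject j_game) {
        return Common::Android::RunJNIOnFiber<std::string>([&](JNIEnv* env) {
            const auto j_title = static_cast<jstring>(
                env->GetObjectField(j_game, Common::Android::GetGameTitleField()));
            return Common::Android::GetJString(env, j_title);
        });
    }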
30
31jclass GetNativeLibraryClass();
32
33jclass GetDiskCacheProgressClass();
34jclass GetDiskCacheLoadCallbackStageClass();
35jclass GetGameDirClass();
36jmethodID GetGameDirConstructor();
37jmethodID GetDiskCacheLoadProgress();
38
39jmethodID GetExitEmulationActivity();
40jmethodID GetOnEmulationStarted();
41jmethodID GetOnEmulationStopped();
42jmethodID GetOnProgramChanged();
43
44jclass GetGameClass();
45jmethodID GetGameConstructor();
46jfieldID GetGameTitleField();
47jfieldID GetGamePathField();
48jfieldID GetGameProgramIdField();
49jfieldID GetGameDeveloperField();
50jfieldID GetGameVersionField();
51jfieldID GetGameIsHomebrewField();
52
53jclass GetStringClass();
54jclass GetPairClass();
55jmethodID GetPairConstructor();
56jfieldID GetPairFirstField();
57jfieldID GetPairSecondField();
58
59jclass GetOverlayControlDataClass();
60jmethodID GetOverlayControlDataConstructor();
61jfieldID GetOverlayControlDataIdField();
62jfieldID GetOverlayControlDataEnabledField();
63jfieldID GetOverlayControlDataLandscapePositionField();
64jfieldID GetOverlayControlDataPortraitPositionField();
65jfieldID GetOverlayControlDataFoldablePositionField();
66
67jclass GetPatchClass();
68jmethodID GetPatchConstructor();
69jfieldID GetPatchEnabledField();
70jfieldID GetPatchNameField();
71jfieldID GetPatchVersionField();
72jfieldID GetPatchTypeField();
73jfieldID GetPatchProgramIdField();
74jfieldID GetPatchTitleIdField();
75
76jclass GetDoubleClass();
77jmethodID GetDoubleConstructor();
78jfieldID GetDoubleValueField();
79
80jclass GetIntegerClass();
81jmethodID GetIntegerConstructor();
82jfieldID GetIntegerValueField();
83
84jclass GetBooleanClass();
85jmethodID GetBooleanConstructor();
86jfieldID GetBooleanValueField();
87
88} // namespace Common::Android
diff --git a/src/common/fs/fs_android.cpp b/src/common/fs/fs_android.cpp
index 1dd826a4a..9a8053222 100755
--- a/src/common/fs/fs_android.cpp
+++ b/src/common/fs/fs_android.cpp
@@ -1,63 +1,38 @@
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include "common/android/android_common.h"
+#include "common/android/id_cache.h"
+#include "common/assert.h"
 #include "common/fs/fs_android.h"
 #include "common/string_util.h"
 
 namespace Common::FS::Android {
 
-JNIEnv* GetEnvForThread() {
-    thread_local static struct OwnedEnv {
-        OwnedEnv() {
-            status = g_jvm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6);
-            if (status == JNI_EDETACHED)
-                g_jvm->AttachCurrentThread(&env, nullptr);
-        }
-
-        ~OwnedEnv() {
-            if (status == JNI_EDETACHED)
-                g_jvm->DetachCurrentThread();
-        }
-
-        int status;
-        JNIEnv* env = nullptr;
-    } owned;
-    return owned.env;
-}
-
 void RegisterCallbacks(JNIEnv* env, jclass clazz) {
     env->GetJavaVM(&g_jvm);
     native_library = clazz;
 
-#define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) \
-    F(JMethodID, JMethodName, Signature)
-#define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) \
-    F(JMethodID, JMethodName, Signature)
-#define FS(FunctionName, ReturnValue, Parameters, JMethodID, JMethodName, Signature) \
-    F(JMethodID, JMethodName, Signature)
-#define F(JMethodID, JMethodName, Signature) \
-    JMethodID = env->GetStaticMethodID(native_library, JMethodName, Signature);
-    ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH)
-    ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR)
-    ANDROID_STORAGE_FUNCTIONS(FS)
-#undef F
-#undef FS
-#undef FR
-#undef FH
+    s_get_parent_directory = env->GetStaticMethodID(native_library, "getParentDirectory",
+                                                    "(Ljava/lang/String;)Ljava/lang/String;");
+    s_get_filename = env->GetStaticMethodID(native_library, "getFilename",
+                                            "(Ljava/lang/String;)Ljava/lang/String;");
+    s_get_size = env->GetStaticMethodID(native_library, "getSize", "(Ljava/lang/String;)J");
+    s_is_directory = env->GetStaticMethodID(native_library, "isDirectory", "(Ljava/lang/String;)Z");
+    s_file_exists = env->GetStaticMethodID(native_library, "exists", "(Ljava/lang/String;)Z");
+    s_open_content_uri = env->GetStaticMethodID(native_library, "openContentUri",
+                                                "(Ljava/lang/String;Ljava/lang/String;)I");
 }
 
 void UnRegisterCallbacks() {
-#define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) F(JMethodID)
-#define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) F(JMethodID)
-#define FS(FunctionName, ReturnValue, Parameters, JMethodID, JMethodName, Signature) F(JMethodID)
-#define F(JMethodID) JMethodID = nullptr;
-    ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH)
-    ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR)
-    ANDROID_STORAGE_FUNCTIONS(FS)
-#undef F
-#undef FS
-#undef FR
-#undef FH
+    s_get_parent_directory = nullptr;
+    s_get_filename = nullptr;
+
+    s_get_size = nullptr;
+    s_is_directory = nullptr;
+    s_file_exists = nullptr;
+
+    s_open_content_uri = nullptr;
 }
 
 bool IsContentUri(const std::string& path) {
@@ -69,8 +44,8 @@ bool IsContentUri(const std::string& path) {
     return path.find(prefix) == 0;
 }
 
-int OpenContentUri(const std::string& filepath, OpenMode openmode) {
-    if (open_content_uri == nullptr)
+s32 OpenContentUri(const std::string& filepath, OpenMode openmode) {
+    if (s_open_content_uri == nullptr)
         return -1;
 
     const char* mode = "";
@@ -82,50 +57,66 @@ int OpenContentUri(const std::string& filepath, OpenMode openmode) {
         UNIMPLEMENTED();
        return -1;
     }
-    auto env = GetEnvForThread();
-    jstring j_filepath = env->NewStringUTF(filepath.c_str());
-    jstring j_mode = env->NewStringUTF(mode);
-    return env->CallStaticIntMethod(native_library, open_content_uri, j_filepath, j_mode);
-}
-
-#define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) \
-    F(FunctionName, ReturnValue, JMethodID, Caller)
-#define F(FunctionName, ReturnValue, JMethodID, Caller) \
-    ReturnValue FunctionName(const std::string& filepath) { \
-        if (JMethodID == nullptr) { \
-            return 0; \
-        } \
-        auto env = GetEnvForThread(); \
-        jstring j_filepath = env->NewStringUTF(filepath.c_str()); \
-        return env->Caller(native_library, JMethodID, j_filepath); \
-    }
-ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR)
-#undef F
-#undef FR
-
-#define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) \
-    F(FunctionName, JMethodID, Caller)
-#define F(FunctionName, JMethodID, Caller) \
-    std::string FunctionName(const std::string& filepath) { \
-        if (JMethodID == nullptr) { \
-            return 0; \
-        } \
-        auto env = GetEnvForThread(); \
-        jstring j_filepath = env->NewStringUTF(filepath.c_str()); \
-        jstring j_return = \
-            static_cast<jstring>(env->Caller(native_library, JMethodID, j_filepath)); \
-        if (!j_return) { \
-            return {}; \
-        } \
-        const jchar* jchars = env->GetStringChars(j_return, nullptr); \
-        const jsize length = env->GetStringLength(j_return); \
-        const std::u16string_view string_view(reinterpret_cast<const char16_t*>(jchars), length); \
-        const std::string converted_string = Common::UTF16ToUTF8(string_view); \
-        env->ReleaseStringChars(j_return, jchars); \
-        return converted_string; \
-    }
-ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH)
-#undef F
-#undef FH
+    auto env = Common::Android::GetEnvForThread();
+    jstring j_filepath = Common::Android::ToJString(env, filepath);
+    jstring j_mode = Common::Android::ToJString(env, mode);
+    return env->CallStaticIntMethod(native_library, s_open_content_uri, j_filepath, j_mode);
+}
+
+u64 GetSize(const std::string& filepath) {
+    if (s_get_size == nullptr) {
+        return 0;
+    }
+    auto env = Common::Android::GetEnvForThread();
+    return static_cast<u64>(env->CallStaticLongMethod(
+        native_library, s_get_size,
+        Common::Android::ToJString(env, filepath)));
+}
+
+bool IsDirectory(const std::string& filepath) {
+    if (s_is_directory == nullptr) {
+        return false;
+    }
+    auto env = Common::Android::GetEnvForThread();
+    return env->CallStaticBooleanMethod(
+        native_library, s_is_directory,
+        Common::Android::ToJString(env, filepath));
+}
+
+bool Exists(const std::string& filepath) {
+    if (s_file_exists == nullptr) {
+        return false;
+    }
+    auto env = Common::Android::GetEnvForThread();
+    return env->CallStaticBooleanMethod(
+        native_library, s_file_exists,
+        Common::Android::ToJString(env, filepath));
+}
+
+std::string GetParentDirectory(const std::string& filepath) {
+    if (s_get_parent_directory == nullptr) {
+        return {};
+    }
+    auto env = Common::Android::GetEnvForThread();
+    jstring j_return = static_cast<jstring>(env->CallStaticObjectMethod(
+        native_library, s_get_parent_directory, Common::Android::ToJString(env, filepath)));
+    if (!j_return) {
+        return {};
+    }
+    return Common::Android::GetJString(env, j_return);
+}
+
+std::string GetFilename(const std::string& filepath) {
+    if (s_get_filename == nullptr) {
+        return {};
+    }
+    auto env = Common::Android::GetEnvForThread();
+    jstring j_return = static_cast<jstring>(env->CallStaticObjectMethod(
+        native_library, s_get_filename, Common::Android::ToJString(env, filepath)));
+    if (!j_return) {
+        return {};
+    }
+    return Common::Android::GetJString(env, j_return);
+}
 
 } // namespace Common::FS::Android
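A hedged usage sketch of the de-macroed wrappers above (the URI value is hypothetical):

    void QueryContentUriSketch() {
        const std::string uri = "content://com.example.provider/document/42";
        if (Common::FS::Android::IsContentUri(uri) && Common::FS::Android::Exists(uri)) {
            const auto size = Common::FS::Android::GetSize(uri);
            const int fd =
                Common::FS::Android::OpenContentUri(uri, Common::FS::Android::OpenMode::Read);
            // fd is a plain file descriptor; -1 signals failure.
        }
    }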
diff --git a/src/common/fs/fs_android.h b/src/common/fs/fs_android.h
index 2c9234313..b33f4beb8 100755
--- a/src/common/fs/fs_android.h
+++ b/src/common/fs/fs_android.h
@@ -7,38 +7,17 @@
 #include <vector>
 #include <jni.h>
 
-#define ANDROID_STORAGE_FUNCTIONS(V) \
-    V(OpenContentUri, int, (const std::string& filepath, OpenMode openmode), open_content_uri, \
-      "openContentUri", "(Ljava/lang/String;Ljava/lang/String;)I")
-
-#define ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(V) \
-    V(GetSize, std::uint64_t, get_size, CallStaticLongMethod, "getSize", "(Ljava/lang/String;)J") \
-    V(IsDirectory, bool, is_directory, CallStaticBooleanMethod, "isDirectory", \
-      "(Ljava/lang/String;)Z") \
-    V(Exists, bool, file_exists, CallStaticBooleanMethod, "exists", "(Ljava/lang/String;)Z")
-
-#define ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(V) \
-    V(GetParentDirectory, get_parent_directory, CallStaticObjectMethod, "getParentDirectory", \
-      "(Ljava/lang/String;)Ljava/lang/String;") \
-    V(GetFilename, get_filename, CallStaticObjectMethod, "getFilename", \
-      "(Ljava/lang/String;)Ljava/lang/String;")
-
 namespace Common::FS::Android {
 
 static JavaVM* g_jvm = nullptr;
 static jclass native_library = nullptr;
 
-#define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) F(JMethodID)
-#define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) F(JMethodID)
-#define FS(FunctionName, ReturnValue, Parameters, JMethodID, JMethodName, Signature) F(JMethodID)
-#define F(JMethodID) static jmethodID JMethodID = nullptr;
-ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH)
-ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR)
-ANDROID_STORAGE_FUNCTIONS(FS)
-#undef F
-#undef FS
-#undef FR
-#undef FH
+static jmethodID s_get_parent_directory;
+static jmethodID s_get_filename;
+static jmethodID s_get_size;
+static jmethodID s_is_directory;
+static jmethodID s_file_exists;
+static jmethodID s_open_content_uri;
 
 enum class OpenMode {
     Read,
@@ -57,24 +36,11 @@ void UnRegisterCallbacks();
 
 bool IsContentUri(const std::string& path);
 
-#define FS(FunctionName, ReturnValue, Parameters, JMethodID, JMethodName, Signature) \
-    F(FunctionName, Parameters, ReturnValue)
-#define F(FunctionName, Parameters, ReturnValue) ReturnValue FunctionName Parameters;
-ANDROID_STORAGE_FUNCTIONS(FS)
-#undef F
-#undef FS
-
-#define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) \
-    F(FunctionName, ReturnValue)
-#define F(FunctionName, ReturnValue) ReturnValue FunctionName(const std::string& filepath);
-ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR)
-#undef F
-#undef FR
-
-#define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) F(FunctionName)
-#define F(FunctionName) std::string FunctionName(const std::string& filepath);
-ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH)
-#undef F
-#undef FH
+int OpenContentUri(const std::string& filepath, OpenMode openmode);
+std::uint64_t GetSize(const std::string& filepath);
+bool IsDirectory(const std::string& filepath);
+bool Exists(const std::string& filepath);
+std::string GetParentDirectory(const std::string& filepath);
+std::string GetFilename(const std::string& filepath);
 
 } // namespace Common::FS::Android
diff --git a/src/core/file_sys/content_archive.cpp b/src/core/file_sys/content_archive.cpp
index 7e543576e..33040d9c3 100755
--- a/src/core/file_sys/content_archive.cpp
+++ b/src/core/file_sys/content_archive.cpp
@@ -172,6 +172,10 @@ u32 NCA::GetSDKVersion() const {
     return reader->GetSdkAddonVersion();
 }
 
+u8 NCA::GetKeyGeneration() const {
+    return reader->GetKeyGeneration();
+}
+
 bool NCA::IsUpdate() const {
     return is_update;
 }
diff --git a/src/core/file_sys/content_archive.h b/src/core/file_sys/content_archive.h
index 8cc82ccb8..1d02d1193 100755
--- a/src/core/file_sys/content_archive.h
+++ b/src/core/file_sys/content_archive.h
@@ -77,6 +77,7 @@ public:
     u64 GetTitleId() const;
     RightsId GetRightsId() const;
     u32 GetSDKVersion() const;
+    u8 GetKeyGeneration() const;
     bool IsUpdate() const;
 
     VirtualFile GetRomFS() const;
diff --git a/src/core/hle/service/am/library_applet_creator.cpp b/src/core/hle/service/am/library_applet_creator.cpp
index c48ed29bc..3e2a1d9c1 100755
--- a/src/core/hle/service/am/library_applet_creator.cpp
+++ b/src/core/hle/service/am/library_applet_creator.cpp
@@ -102,8 +102,14 @@ std::shared_ptr<ILibraryAppletAccessor> CreateGuestApplet(Core::System& system,
         return {};
     }
 
+    // TODO: enable other versions of applets
+    enum : u8 {
+        Firmware1600 = 15,
+        Firmware1700 = 16,
+    };
+
     auto process = std::make_unique<Process>(system);
-    if (!process->Initialize(program_id)) {
+    if (!process->Initialize(program_id, Firmware1600, Firmware1700)) {
         // Couldn't initialize the guest process
         return {};
     }
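The two constants bound the NCA key generations the guest-applet path will accept; Process::Initialize (below) rejects anything outside the range unless the minimum is zero. A small sketch of that acceptance test, mirroring the check rather than asserting firmware specifics:

    #include <cstdint>

    constexpr bool IsAcceptedKeyGeneration(std::uint8_t generation, std::uint8_t minimum,
                                           std::uint8_t maximum) {
        return minimum == 0 || (generation >= minimum && generation <= maximum);
    }

    static_assert(IsAcceptedKeyGeneration(15, 15, 16));
    static_assert(IsAcceptedKeyGeneration(16, 15, 16));
    static_assert(!IsAcceptedKeyGeneration(17, 15, 16));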
diff --git a/src/core/hle/service/am/process.cpp b/src/core/hle/service/am/process.cpp
index 16b685f86..992c50713 100755
--- a/src/core/hle/service/am/process.cpp
+++ b/src/core/hle/service/am/process.cpp
@@ -3,6 +3,7 @@
 
 #include "common/scope_exit.h"
 
+#include "core/file_sys/content_archive.h"
 #include "core/file_sys/nca_metadata.h"
 #include "core/file_sys/registered_cache.h"
 #include "core/hle/kernel/k_process.h"
@@ -20,7 +21,7 @@ Process::~Process() {
     this->Finalize();
 }
 
-bool Process::Initialize(u64 program_id) {
+bool Process::Initialize(u64 program_id, u8 minimum_key_generation, u8 maximum_key_generation) {
     // First, ensure we are not holding another process.
     this->Finalize();
 
@@ -29,21 +30,33 @@
 
     // Attempt to load program NCA.
     const FileSys::RegisteredCache* bis_system{};
-    FileSys::VirtualFile nca{};
+    FileSys::VirtualFile nca_raw{};
 
     // Get the program NCA from built-in storage.
     bis_system = fsc.GetSystemNANDContents();
     if (bis_system) {
-        nca = bis_system->GetEntryRaw(program_id, FileSys::ContentRecordType::Program);
+        nca_raw = bis_system->GetEntryRaw(program_id, FileSys::ContentRecordType::Program);
     }
 
     // Ensure we retrieved a program NCA.
-    if (!nca) {
+    if (!nca_raw) {
         return false;
     }
 
+    // Ensure we have a suitable version.
+    if (minimum_key_generation > 0) {
+        FileSys::NCA nca(nca_raw);
+        if (nca.GetStatus() == Loader::ResultStatus::Success &&
+            (nca.GetKeyGeneration() < minimum_key_generation ||
+             nca.GetKeyGeneration() > maximum_key_generation)) {
+            LOG_WARNING(Service_LDR, "Skipping program {:016X} with generation {}", program_id,
+                        nca.GetKeyGeneration());
+            return false;
+        }
+    }
+
     // Get the appropriate loader to parse this NCA.
-    auto app_loader = Loader::GetLoader(m_system, nca, program_id, 0);
+    auto app_loader = Loader::GetLoader(m_system, nca_raw, program_id, 0);
 
     // Ensure we have a loader which can parse the NCA.
     if (!app_loader) {
diff --git a/src/core/hle/service/am/process.h b/src/core/hle/service/am/process.h
index 4b908ade4..4b8102fb6 100755
--- a/src/core/hle/service/am/process.h
+++ b/src/core/hle/service/am/process.h
@@ -21,7 +21,7 @@ public:
     explicit Process(Core::System& system);
     ~Process();
 
-    bool Initialize(u64 program_id);
+    bool Initialize(u64 program_id, u8 minimum_key_generation, u8 maximum_key_generation);
     void Finalize();
 
     bool Run();
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index dd769b75f..000001b6d 100755
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -390,4 +390,8 @@ if (ANDROID AND ARCHITECTURE_arm64)
     target_link_libraries(video_core PRIVATE adrenotools)
 endif()
 
+if (ARCHITECTURE_arm64)
+    target_link_libraries(video_core PRIVATE sse2neon)
+endif()
+
 create_target_directory_groups(video_core)
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp
index 705285fd0..14d87a9de 100755
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -12,7 +12,10 @@
 #include <immintrin.h>
 #endif
 #elif defined(ARCHITECTURE_arm64)
-#include <arm_neon.h>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-int-conversion"
+#include <sse2neon.h>
+#pragma GCC diagnostic pop
 #endif
 
 extern "C" {
@@ -43,8 +46,6 @@ extern "C" {
 
 #if defined(ARCHITECTURE_x86_64)
 #include "common/x64/cpu_detect.h"
-#elif defined(ARCHITECTURE_arm64)
-// Some ARM64 detect
 #endif
 
 namespace Tegra::Host1x {
@@ -244,7 +245,9 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot,
         DecodeLinear();
         return;
     }
+#endif
 
+#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
     const auto alpha =
         _mm_slli_epi64(_mm_set1_epi64x(static_cast<s64>(slot.config.planar_alpha.Value())), 48);
 
@@ -379,8 +382,6 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot,
             // clang-format on
         }
     }
-#elif defined(ARCHITECTURE_arm64)
-    DecodeLinear();
 #else
     DecodeLinear();
 #endif
@@ -624,7 +625,9 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) {
         DecodeLinear();
         return;
     }
+#endif
 
+#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
     // Fill the columns, e.g
     // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0]
 
@@ -767,8 +770,6 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) {
         }
     }
     // clang-format on
-#elif defined(ARCHITECTURE_arm64)
-    DecodeLinear();
 #else
     DecodeLinear();
 #endif
@@ -820,7 +821,9 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) {
         DecodeLinear(out_luma, out_chroma);
         return;
     }
+#endif
 
+#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
     // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF]
     const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1);
 
@@ -947,8 +950,6 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) {
             // clang-format on
         }
     }
-#elif defined(ARCHITECTURE_arm64)
-    DecodeLinear(out_luma, out_chroma);
 #else
     DecodeLinear(out_luma, out_chroma);
 #endif
@@ -1079,7 +1080,9 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) {
         DecodeLinear(out_buffer);
         return;
     }
+#endif
 
+#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
     for (u32 y = 0; y < surface_height; y++) {
         const auto src = y * surface_stride;
         const auto dst = y * out_luma_stride;
@@ -1144,8 +1147,6 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) {
             // clang-format on
         }
     }
-#elif defined(ARCHITECTURE_arm64)
-    DecodeLinear(out_buffer);
 #else
     DecodeLinear(out_buffer);
 #endif
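The sse2neon switch above is what lets the x86_64 intrinsic paths in this file compile unchanged on ARM64: sse2neon.h re-implements each _mm_* intrinsic in terms of NEON, so the former arm64-only DecodeLinear() fallbacks can be dropped. A self-contained illustration of the idea (the function itself is hypothetical):

    #include <cstdint>
    #if defined(ARCHITECTURE_x86_64)
    #include <emmintrin.h> // SSE2
    #elif defined(ARCHITECTURE_arm64)
    #include <sse2neon.h> // translates _mm_* to NEON
    #endif

    // On ARM64, _mm_set1_epi16 maps to vdupq_n_s16 and _mm_add_epi16 to vaddq_s16.
    __m128i AddAlphaToPixels(__m128i pixels, std::int16_t alpha) {
        return _mm_add_epi16(pixels, _mm_set1_epi16(alpha));
    }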