aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorpineappleEA <pineaea@gmail.com>2021-07-08 04:13:21 +0200
committerpineappleEA <pineaea@gmail.com>2021-07-08 04:13:21 +0200
commit8a46c57b52760ee05b8adee50645dba3c99b1cb3 (patch)
treeab50dd3f992cfd607f8e506be82aad3acc6ad2a4
parent67d98725f830aa0e448cfffa330e7826d3a424f8 (diff)
early-access version 1859EA-1859
-rwxr-xr-xREADME.md2
-rwxr-xr-xsrc/tests/video_core/buffer_base.cpp2
-rwxr-xr-xsrc/video_core/buffer_cache/buffer_base.h14
-rwxr-xr-xsrc/video_core/buffer_cache/buffer_cache.h239
-rwxr-xr-xsrc/video_core/dma_pusher.cpp10
-rwxr-xr-xsrc/video_core/fence_manager.h41
-rwxr-xr-xsrc/video_core/gpu.cpp4
-rwxr-xr-xsrc/video_core/gpu_thread.cpp15
-rwxr-xr-xsrc/video_core/gpu_thread.h5
-rwxr-xr-xsrc/video_core/rasterizer_interface.h3
-rwxr-xr-xsrc/video_core/renderer_opengl/gl_rasterizer.cpp8
-rwxr-xr-xsrc/video_core/renderer_opengl/gl_rasterizer.h1
-rwxr-xr-xsrc/video_core/renderer_vulkan/vk_fence_manager.cpp4
-rwxr-xr-xsrc/video_core/renderer_vulkan/vk_rasterizer.cpp10
-rwxr-xr-xsrc/video_core/renderer_vulkan/vk_rasterizer.h1
-rwxr-xr-xsrc/video_core/renderer_vulkan/vk_scheduler.h4
-rwxr-xr-xsrc/video_core/texture_cache/types.h4
17 files changed, 270 insertions, 97 deletions
diff --git a/README.md b/README.md
index 2d33e2b79..42f6be0ac 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
1yuzu emulator early access 1yuzu emulator early access
2============= 2=============
3 3
4This is the source code for early-access 1858. 4This is the source code for early-access 1859.
5 5
6## Legal Notice 6## Legal Notice
7 7
diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp
index edced69bb..cfcdc2253 100755
--- a/src/tests/video_core/buffer_base.cpp
+++ b/src/tests/video_core/buffer_base.cpp
@@ -536,7 +536,7 @@ TEST_CASE("BufferBase: Cached write downloads") {
536 REQUIRE(rasterizer.Count() == 63); 536 REQUIRE(rasterizer.Count() == 63);
537 buffer.MarkRegionAsGpuModified(c + PAGE, PAGE); 537 buffer.MarkRegionAsGpuModified(c + PAGE, PAGE);
538 int num = 0; 538 int num = 0;
539 buffer.ForEachDownloadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); 539 buffer.ForEachDownloadRange(c, WORD, true, [&](u64 offset, u64 size) { ++num; });
540 buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); 540 buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
541 REQUIRE(num == 0); 541 REQUIRE(num == 0);
542 REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); 542 REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index b121d36a3..9e39858c8 100755
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -226,19 +226,19 @@ public:
226 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified 226 /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
227 template <typename Func> 227 template <typename Func>
228 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { 228 void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
229 ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func); 229 ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func);
230 } 230 }
231 231
232 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified 232 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
233 template <typename Func> 233 template <typename Func>
234 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { 234 void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) {
235 ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func); 235 ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func);
236 } 236 }
237 237
238 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified 238 /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
239 template <typename Func> 239 template <typename Func>
240 void ForEachDownloadRange(Func&& func) { 240 void ForEachDownloadRange(Func&& func) {
241 ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func); 241 ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func);
242 } 242 }
243 243
244 /// Mark buffer as picked 244 /// Mark buffer as picked
@@ -415,7 +415,7 @@ private:
415 * @param func Function to call for each turned off region 415 * @param func Function to call for each turned off region
416 */ 416 */
417 template <Type type, typename Func> 417 template <Type type, typename Func>
418 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { 418 void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) {
419 static_assert(type != Type::Untracked); 419 static_assert(type != Type::Untracked);
420 420
421 const s64 difference = query_cpu_range - cpu_addr; 421 const s64 difference = query_cpu_range - cpu_addr;
@@ -467,7 +467,9 @@ private:
467 bits = (bits << left_offset) >> left_offset; 467 bits = (bits << left_offset) >> left_offset;
468 468
469 const u64 current_word = state_words[word_index] & bits; 469 const u64 current_word = state_words[word_index] & bits;
470 state_words[word_index] &= ~bits; 470 if (clear) {
471 state_words[word_index] &= ~bits;
472 }
471 473
472 if constexpr (type == Type::CPU) { 474 if constexpr (type == Type::CPU) {
473 const u64 current_bits = untracked_words[word_index] & bits; 475 const u64 current_bits = untracked_words[word_index] & bits;
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index cad7f902d..f04538dca 100755
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -15,6 +15,7 @@
15#include <vector> 15#include <vector>
16 16
17#include <boost/container/small_vector.hpp> 17#include <boost/container/small_vector.hpp>
18#include <boost/icl/interval_set.hpp>
18 19
19#include "common/common_types.h" 20#include "common/common_types.h"
20#include "common/div_ceil.h" 21#include "common/div_ceil.h"
@@ -77,6 +78,9 @@ class BufferCache {
77 using Runtime = typename P::Runtime; 78 using Runtime = typename P::Runtime;
78 using Buffer = typename P::Buffer; 79 using Buffer = typename P::Buffer;
79 80
81 using IntervalSet = boost::icl::interval_set<VAddr>;
82 using IntervalType = typename IntervalSet::interval_type;
83
80 struct Empty {}; 84 struct Empty {};
81 85
82 struct OverlapResult { 86 struct OverlapResult {
@@ -148,11 +152,14 @@ public:
148 /// Return true when there are uncommitted buffers to be downloaded 152 /// Return true when there are uncommitted buffers to be downloaded
149 [[nodiscard]] bool HasUncommittedFlushes() const noexcept; 153 [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
150 154
155 void AccumulateFlushes();
156
151 /// Return true when the caller should wait for async downloads 157 /// Return true when the caller should wait for async downloads
152 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; 158 [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
153 159
154 /// Commit asynchronous downloads 160 /// Commit asynchronous downloads
155 void CommitAsyncFlushes(); 161 void CommitAsyncFlushes();
162 void CommitAsyncFlushesHigh();
156 163
157 /// Pop asynchronous downloads 164 /// Pop asynchronous downloads
158 void PopAsyncFlushes(); 165 void PopAsyncFlushes();
@@ -160,6 +167,9 @@ public:
160 /// Return true when a CPU region is modified from the GPU 167 /// Return true when a CPU region is modified from the GPU
161 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); 168 [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
162 169
170 /// Return true when a CPU region is modified from the GPU
171 [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
172
163 std::mutex mutex; 173 std::mutex mutex;
164 174
165private: 175private:
@@ -272,8 +282,6 @@ private:
272 282
273 void DeleteBuffer(BufferId buffer_id); 283 void DeleteBuffer(BufferId buffer_id);
274 284
275 void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
276
277 void NotifyBufferDeletion(); 285 void NotifyBufferDeletion();
278 286
279 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; 287 [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
@@ -327,9 +335,9 @@ private:
327 335
328 std::vector<BufferId> cached_write_buffer_ids; 336 std::vector<BufferId> cached_write_buffer_ids;
329 337
330 // TODO: This data structure is not optimal and it should be reworked 338 IntervalSet uncommitted_ranges;
331 std::vector<BufferId> uncommitted_downloads; 339 IntervalSet common_ranges;
332 std::deque<std::vector<BufferId>> committed_downloads; 340 std::deque<IntervalSet> committed_ranges;
333 341
334 size_t immediate_buffer_capacity = 0; 342 size_t immediate_buffer_capacity = 0;
335 std::unique_ptr<u8[]> immediate_buffer_alloc; 343 std::unique_ptr<u8[]> immediate_buffer_alloc;
@@ -352,6 +360,7 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
352 // Ensure the first slot is used for the null buffer 360 // Ensure the first slot is used for the null buffer
353 void(slot_buffers.insert(runtime, NullBufferParams{})); 361 void(slot_buffers.insert(runtime, NullBufferParams{}));
354 deletion_iterator = slot_buffers.end(); 362 deletion_iterator = slot_buffers.end();
363 common_ranges.clear();
355} 364}
356 365
357template <class P> 366template <class P>
@@ -547,29 +556,30 @@ void BufferCache<P>::FlushCachedWrites() {
547 556
548template <class P> 557template <class P>
549bool BufferCache<P>::HasUncommittedFlushes() const noexcept { 558bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
550 return !uncommitted_downloads.empty(); 559 return !uncommitted_ranges.empty() || !committed_ranges.empty();
551} 560}
552 561
553template <class P> 562template <class P>
554bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { 563void BufferCache<P>::AccumulateFlushes() {
555 return !committed_downloads.empty() && !committed_downloads.front().empty(); 564 if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) {
565 uncommitted_ranges.clear();
566 return;
567 }
568 if (uncommitted_ranges.empty()) {
569 return;
570 }
571 committed_ranges.emplace_back(std::move(uncommitted_ranges));
556} 572}
557 573
558template <class P> 574template <class P>
559void BufferCache<P>::CommitAsyncFlushes() { 575bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
560 // This is intentionally passing the value by copy 576 return false;
561 committed_downloads.push_front(uncommitted_downloads);
562 uncommitted_downloads.clear();
563} 577}
564 578
565template <class P> 579template <class P>
566void BufferCache<P>::PopAsyncFlushes() { 580void BufferCache<P>::CommitAsyncFlushesHigh() {
567 if (committed_downloads.empty()) { 581 AccumulateFlushes();
568 return; 582 if (committed_ranges.empty()) {
569 }
570 auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
571 const std::span<const BufferId> download_ids = committed_downloads.back();
572 if (download_ids.empty()) {
573 return; 583 return;
574 } 584 }
575 MICROPROFILE_SCOPE(GPU_DownloadMemory); 585 MICROPROFILE_SCOPE(GPU_DownloadMemory);
@@ -577,20 +587,67 @@ void BufferCache<P>::PopAsyncFlushes() {
577 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; 587 boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
578 u64 total_size_bytes = 0; 588 u64 total_size_bytes = 0;
579 u64 largest_copy = 0; 589 u64 largest_copy = 0;
580 for (const BufferId buffer_id : download_ids) { 590 for (const IntervalSet& intervals : committed_ranges) {
581 slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { 591 for (auto& interval : intervals) {
582 downloads.push_back({ 592 const std::size_t size = interval.upper() - interval.lower();
583 BufferCopy{ 593 const VAddr cpu_addr = interval.lower();
584 .src_offset = range_offset, 594 const VAddr cpu_addr_end = interval.upper();
585 .dst_offset = total_size_bytes, 595 ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
586 .size = range_size, 596 boost::container::small_vector<BufferCopy, 1> copies;
587 }, 597 buffer.ForEachDownloadRange(
588 buffer_id, 598 cpu_addr, size, true, [&](u64 range_offset, u64 range_size) {
599 const VAddr buffer_addr = buffer.CpuAddr();
600 const auto add_download = [&](VAddr start, VAddr end) {
601 const u64 new_offset = start - buffer_addr;
602 const u64 new_size = end - start;
603 downloads.push_back({
604 BufferCopy{
605 .src_offset = new_offset,
606 .dst_offset = total_size_bytes,
607 .size = new_size,
608 },
609 buffer_id,
610 });
611 // Align up to avoid cache conflicts
612 constexpr u64 align = 256ULL;
613 constexpr u64 mask = ~(align - 1ULL);
614 total_size_bytes += (new_size + align - 1) & mask;
615 largest_copy = std::max(largest_copy, new_size);
616 };
617
618 const VAddr start_address = buffer_addr + range_offset;
619 const VAddr end_address = start_address + range_size;
620 const IntervalType search_interval{cpu_addr, 1};
621 auto it = common_ranges.lower_bound(search_interval);
622 if (it == common_ranges.end()) {
623 it = common_ranges.begin();
624 }
625 while (it != common_ranges.end()) {
626 VAddr inter_addr_end = it->upper();
627 VAddr inter_addr = it->lower();
628 if (inter_addr >= end_address) {
629 break;
630 }
631 if (inter_addr_end <= start_address) {
632 it++;
633 continue;
634 }
635 if (inter_addr_end > end_address) {
636 inter_addr_end = end_address;
637 }
638 if (inter_addr < start_address) {
639 inter_addr = start_address;
640 }
641 add_download(inter_addr, inter_addr_end);
642 it++;
643 }
644 const IntervalType subtract_interval{start_address, end_address};
645 common_ranges.subtract(subtract_interval);
646 });
589 }); 647 });
590 total_size_bytes += range_size; 648 }
591 largest_copy = std::max(largest_copy, range_size);
592 });
593 } 649 }
650 committed_ranges.clear();
594 if (downloads.empty()) { 651 if (downloads.empty()) {
595 return; 652 return;
596 } 653 }
@@ -623,6 +680,19 @@ void BufferCache<P>::PopAsyncFlushes() {
623} 680}
624 681
625template <class P> 682template <class P>
683void BufferCache<P>::CommitAsyncFlushes() {
684 if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
685 CommitAsyncFlushesHigh();
686 } else {
687 uncommitted_ranges.clear();
688 committed_ranges.clear();
689 }
690}
691
692template <class P>
693void BufferCache<P>::PopAsyncFlushes() {}
694
695template <class P>
626bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { 696bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
627 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); 697 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
628 for (u64 page = addr >> PAGE_BITS; page < page_end;) { 698 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
@@ -642,6 +712,25 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
642} 712}
643 713
644template <class P> 714template <class P>
715bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
716 const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
717 for (u64 page = addr >> PAGE_BITS; page < page_end;) {
718 const BufferId image_id = page_table[page];
719 if (!image_id) {
720 ++page;
721 continue;
722 }
723 Buffer& buffer = slot_buffers[image_id];
724 if (buffer.IsRegionCpuModified(addr, size)) {
725 return true;
726 }
727 const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
728 page = Common::DivCeil(end_addr, PAGE_SIZE);
729 }
730 return false;
731}
732
733template <class P>
645void BufferCache<P>::BindHostIndexBuffer() { 734void BufferCache<P>::BindHostIndexBuffer() {
646 Buffer& buffer = slot_buffers[index_buffer.buffer_id]; 735 Buffer& buffer = slot_buffers[index_buffer.buffer_id];
647 TouchBuffer(buffer); 736 TouchBuffer(buffer);
@@ -1010,16 +1099,16 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s
1010 Buffer& buffer = slot_buffers[buffer_id]; 1099 Buffer& buffer = slot_buffers[buffer_id];
1011 buffer.MarkRegionAsGpuModified(cpu_addr, size); 1100 buffer.MarkRegionAsGpuModified(cpu_addr, size);
1012 1101
1013 const bool is_accuracy_high = Settings::IsGPULevelHigh(); 1102 const IntervalType base_interval{cpu_addr, cpu_addr + size};
1103 common_ranges.add(base_interval);
1104
1105 const bool is_accuracy_high =
1106 Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
1014 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); 1107 const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
1015 if (!is_accuracy_high || !is_async) { 1108 if (!is_async && !is_accuracy_high) {
1016 return;
1017 }
1018 if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
1019 // Already inserted
1020 return; 1109 return;
1021 } 1110 }
1022 uncommitted_downloads.push_back(buffer_id); 1111 uncommitted_ranges.add(base_interval);
1023} 1112}
1024 1113
1025template <class P> 1114template <class P>
@@ -1103,7 +1192,6 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
1103 if (!copies.empty()) { 1192 if (!copies.empty()) {
1104 runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); 1193 runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
1105 } 1194 }
1106 ReplaceBufferDownloads(overlap_id, new_buffer_id);
1107 DeleteBuffer(overlap_id); 1195 DeleteBuffer(overlap_id);
1108} 1196}
1109 1197
@@ -1244,14 +1332,51 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
1244 boost::container::small_vector<BufferCopy, 1> copies; 1332 boost::container::small_vector<BufferCopy, 1> copies;
1245 u64 total_size_bytes = 0; 1333 u64 total_size_bytes = 0;
1246 u64 largest_copy = 0; 1334 u64 largest_copy = 0;
1247 buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { 1335 buffer.ForEachDownloadRange(cpu_addr, size, true, [&](u64 range_offset, u64 range_size) {
1248 copies.push_back(BufferCopy{ 1336 const VAddr buffer_addr = buffer.CpuAddr();
1249 .src_offset = range_offset, 1337 const auto add_download = [&](VAddr start, VAddr end) {
1250 .dst_offset = total_size_bytes, 1338 const u64 new_offset = start - buffer_addr;
1251 .size = range_size, 1339 const u64 new_size = end - start;
1252 }); 1340 copies.push_back(BufferCopy{
1253 total_size_bytes += range_size; 1341 .src_offset = new_offset,
1254 largest_copy = std::max(largest_copy, range_size); 1342 .dst_offset = total_size_bytes,
1343 .size = new_size,
1344 });
1345 // Align up to avoid cache conflicts
1346 constexpr u64 align = 256ULL;
1347 constexpr u64 mask = ~(align - 1ULL);
1348 total_size_bytes += (new_size + align - 1) & mask;
1349 largest_copy = std::max(largest_copy, new_size);
1350 };
1351
1352 const VAddr start_address = buffer_addr + range_offset;
1353 const VAddr end_address = start_address + range_size;
1354 const IntervalType search_interval{start_address - range_size, 1};
1355 auto it = common_ranges.lower_bound(search_interval);
1356 if (it == common_ranges.end()) {
1357 it = common_ranges.begin();
1358 }
1359 while (it != common_ranges.end()) {
1360 VAddr inter_addr_end = it->upper();
1361 VAddr inter_addr = it->lower();
1362 if (inter_addr >= end_address) {
1363 break;
1364 }
1365 if (inter_addr_end <= start_address) {
1366 it++;
1367 continue;
1368 }
1369 if (inter_addr_end > end_address) {
1370 inter_addr_end = end_address;
1371 }
1372 if (inter_addr < start_address) {
1373 inter_addr = start_address;
1374 }
1375 add_download(inter_addr, inter_addr_end);
1376 it++;
1377 }
1378 const IntervalType subtract_interval{start_address, end_address};
1379 common_ranges.subtract(subtract_interval);
1255 }); 1380 });
1256 if (total_size_bytes == 0) { 1381 if (total_size_bytes == 0) {
1257 return; 1382 return;
@@ -1316,18 +1441,6 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
1316} 1441}
1317 1442
1318template <class P> 1443template <class P>
1319void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
1320 const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
1321 std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
1322 if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
1323 buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
1324 }
1325 };
1326 replace(uncommitted_downloads);
1327 std::ranges::for_each(committed_downloads, replace);
1328}
1329
1330template <class P>
1331void BufferCache<P>::NotifyBufferDeletion() { 1444void BufferCache<P>::NotifyBufferDeletion() {
1332 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { 1445 if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
1333 dirty_uniform_buffers.fill(~u32{0}); 1446 dirty_uniform_buffers.fill(~u32{0});
@@ -1349,15 +1462,9 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
1349 if (!cpu_addr || size == 0) { 1462 if (!cpu_addr || size == 0) {
1350 return NULL_BINDING; 1463 return NULL_BINDING;
1351 } 1464 }
1352 // HACK(Rodrigo): This is the number of bytes bound in host beyond the guest API's range.
1353 // It exists due to some games like Astral Chain operate out of bounds.
1354 // Binding the whole map range would be technically correct, but games have large maps that make
1355 // this approach unaffordable for now.
1356 static constexpr u32 arbitrary_extra_bytes = 0xc000;
1357 const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr));
1358 const Binding binding{ 1465 const Binding binding{
1359 .cpu_addr = *cpu_addr, 1466 .cpu_addr = *cpu_addr,
1360 .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end), 1467 .size = size,
1361 .buffer_id = BufferId{}, 1468 .buffer_id = BufferId{},
1362 }; 1469 };
1363 return binding; 1470 return binding;
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 8b33c04ab..8d28bd884 100755
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -4,6 +4,7 @@
4 4
5#include "common/cityhash.h" 5#include "common/cityhash.h"
6#include "common/microprofile.h" 6#include "common/microprofile.h"
7#include "common/settings.h"
7#include "core/core.h" 8#include "core/core.h"
8#include "core/memory.h" 9#include "core/memory.h"
9#include "video_core/dma_pusher.h" 10#include "video_core/dma_pusher.h"
@@ -76,8 +77,13 @@ bool DmaPusher::Step() {
76 77
77 // Push buffer non-empty, read a word 78 // Push buffer non-empty, read a word
78 command_headers.resize(command_list_header.size); 79 command_headers.resize(command_list_header.size);
79 gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), 80 if (Settings::IsGPULevelHigh()) {
80 command_list_header.size * sizeof(u32)); 81 gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(),
82 command_list_header.size * sizeof(u32));
83 } else {
84 gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
85 command_list_header.size * sizeof(u32));
86 }
81 } 87 }
82 for (std::size_t index = 0; index < command_headers.size();) { 88 for (std::size_t index = 0; index < command_headers.size();) {
83 const CommandHeader& command_header = command_headers[index]; 89 const CommandHeader& command_header = command_headers[index];
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h
index f055b61e9..be1bc7f64 100755
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -8,6 +8,7 @@
8#include <queue> 8#include <queue>
9 9
10#include "common/common_types.h" 10#include "common/common_types.h"
11#include "common/settings.h"
11#include "core/core.h" 12#include "core/core.h"
12#include "video_core/delayed_destruction_ring.h" 13#include "video_core/delayed_destruction_ring.h"
13#include "video_core/gpu.h" 14#include "video_core/gpu.h"
@@ -53,6 +54,12 @@ public:
53 delayed_destruction_ring.Tick(); 54 delayed_destruction_ring.Tick();
54 } 55 }
55 56
57 // Unlike other fences, this one doesn't
58 void SignalOrdering() {
59 std::scoped_lock lock{buffer_cache.mutex};
60 buffer_cache.AccumulateFlushes();
61 }
62
56 void SignalSemaphore(GPUVAddr addr, u32 value) { 63 void SignalSemaphore(GPUVAddr addr, u32 value) {
57 TryReleasePendingFences(); 64 TryReleasePendingFences();
58 const bool should_flush = ShouldFlush(); 65 const bool should_flush = ShouldFlush();
@@ -96,6 +103,23 @@ public:
96 } 103 }
97 } 104 }
98 105
106 void TryReleasePendingFences() {
107 while (!fences.empty()) {
108 TFence& current_fence = fences.front();
109 if (ShouldWait() && !IsFenceSignaled(current_fence)) {
110 return;
111 }
112 PopAsyncFlushes();
113 if (current_fence->IsSemaphore()) {
114 gpu_memory.template Write<u32>(current_fence->GetAddress(),
115 current_fence->GetPayload());
116 } else {
117 gpu.IncrementSyncPoint(current_fence->GetPayload());
118 }
119 PopFence();
120 }
121 }
122
99protected: 123protected:
100 explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, 124 explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
101 TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, 125 TTextureCache& texture_cache_, TTBufferCache& buffer_cache_,
@@ -125,23 +149,6 @@ protected:
125 TQueryCache& query_cache; 149 TQueryCache& query_cache;
126 150
127private: 151private:
128 void TryReleasePendingFences() {
129 while (!fences.empty()) {
130 TFence& current_fence = fences.front();
131 if (ShouldWait() && !IsFenceSignaled(current_fence)) {
132 return;
133 }
134 PopAsyncFlushes();
135 if (current_fence->IsSemaphore()) {
136 gpu_memory.template Write<u32>(current_fence->GetAddress(),
137 current_fence->GetPayload());
138 } else {
139 gpu.IncrementSyncPoint(current_fence->GetPayload());
140 }
141 PopFence();
142 }
143 }
144
145 bool ShouldWait() const { 152 bool ShouldWait() const {
146 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; 153 std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
147 return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() || 154 return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() ||
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 35cc561be..f317ddc2b 100755
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -268,11 +268,13 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
268 case BufferMethods::SemaphoreAddressHigh: 268 case BufferMethods::SemaphoreAddressHigh:
269 case BufferMethods::SemaphoreAddressLow: 269 case BufferMethods::SemaphoreAddressLow:
270 case BufferMethods::SemaphoreSequence: 270 case BufferMethods::SemaphoreSequence:
271 case BufferMethods::RefCnt:
272 case BufferMethods::UnkCacheFlush: 271 case BufferMethods::UnkCacheFlush:
273 case BufferMethods::WrcacheFlush: 272 case BufferMethods::WrcacheFlush:
274 case BufferMethods::FenceValue: 273 case BufferMethods::FenceValue:
275 break; 274 break;
275 case BufferMethods::RefCnt:
276 rasterizer->SignalReference();
277 break;
276 case BufferMethods::FenceAction: 278 case BufferMethods::FenceAction:
277 ProcessFenceActionMethod(); 279 ProcessFenceActionMethod();
278 break; 280 break;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 46f642b19..25c0d30dd 100755
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -8,6 +8,7 @@
8#include "common/settings.h" 8#include "common/settings.h"
9#include "common/thread.h" 9#include "common/thread.h"
10#include "core/core.h" 10#include "core/core.h"
11#include "core/core_timing.h"
11#include "core/frontend/emu_window.h" 12#include "core/frontend/emu_window.h"
12#include "video_core/dma_pusher.h" 13#include "video_core/dma_pusher.h"
13#include "video_core/gpu.h" 14#include "video_core/gpu.h"
@@ -83,6 +84,17 @@ void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
83 rasterizer = renderer.ReadRasterizer(); 84 rasterizer = renderer.ReadRasterizer();
84 thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), 85 thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
85 std::ref(dma_pusher), std::ref(state)); 86 std::ref(dma_pusher), std::ref(state));
87 gpu_sync_event = Core::Timing::CreateEvent(
88 "GPUHostSyncCallback", [this](std::uintptr_t, std::chrono::nanoseconds) {
89 if (!state.is_running) {
90 return;
91 }
92
93 OnCommandListEnd();
94 const auto time_interval = std::chrono::nanoseconds{500 * 1000};
95 system.CoreTiming().ScheduleEvent(time_interval, gpu_sync_event);
96 });
97 system.CoreTiming().ScheduleEvent(std::chrono::nanoseconds{500 * 1000}, gpu_sync_event);
86} 98}
87 99
88void ThreadManager::SubmitList(Tegra::CommandList&& entries) { 100void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
@@ -128,6 +140,9 @@ void ThreadManager::ShutDown() {
128 state.cv.notify_all(); 140 state.cv.notify_all();
129 } 141 }
130 142
143 system.CoreTiming().UnscheduleEvent(gpu_sync_event, 0);
144 system.CoreTiming().RemoveEvent(gpu_sync_event);
145
131 if (!thread.joinable()) { 146 if (!thread.joinable()) {
132 return; 147 return;
133 } 148 }
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 11a648f38..ea98df3b1 100755
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -20,6 +20,10 @@ class DmaPusher;
20} // namespace Tegra 20} // namespace Tegra
21 21
22namespace Core { 22namespace Core {
23namespace Timing {
24class CoreTiming;
25struct EventType;
26} // namespace Timing
23namespace Frontend { 27namespace Frontend {
24class GraphicsContext; 28class GraphicsContext;
25} 29}
@@ -150,6 +154,7 @@ private:
150 154
151 SynchState state; 155 SynchState state;
152 std::thread thread; 156 std::thread thread;
157 std::shared_ptr<Core::Timing::EventType> gpu_sync_event;
153}; 158};
154 159
155} // namespace VideoCommon::GPUThread 160} // namespace VideoCommon::GPUThread
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index fcbdacae3..554dd70b6 100755
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -63,6 +63,9 @@ public:
63 /// Signal a GPU based syncpoint as a fence 63 /// Signal a GPU based syncpoint as a fence
64 virtual void SignalSyncPoint(u32 value) = 0; 64 virtual void SignalSyncPoint(u32 value) = 0;
65 65
66 /// Signal a GPU based reference as point
67 virtual void SignalReference() = 0;
68
66 /// Release all pending fences. 69 /// Release all pending fences.
67 virtual void ReleaseFences() = 0; 70 virtual void ReleaseFences() = 0;
68 71
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 99bd7ac9c..183861a23 100755
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -654,6 +654,13 @@ void RasterizerOpenGL::SignalSyncPoint(u32 value) {
654 fence_manager.SignalSyncPoint(value); 654 fence_manager.SignalSyncPoint(value);
655} 655}
656 656
657void RasterizerOpenGL::SignalReference() {
658 if (!gpu.IsAsync()) {
659 return;
660 }
661 fence_manager.SignalOrdering();
662}
663
657void RasterizerOpenGL::ReleaseFences() { 664void RasterizerOpenGL::ReleaseFences() {
658 if (!gpu.IsAsync()) { 665 if (!gpu.IsAsync()) {
659 return; 666 return;
@@ -670,6 +677,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
670 677
671void RasterizerOpenGL::WaitForIdle() { 678void RasterizerOpenGL::WaitForIdle() {
672 glMemoryBarrier(GL_ALL_BARRIER_BITS); 679 glMemoryBarrier(GL_ALL_BARRIER_BITS);
680 SignalReference();
673} 681}
674 682
675void RasterizerOpenGL::FragmentBarrier() { 683void RasterizerOpenGL::FragmentBarrier() {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index ad7326ece..87f44cd62 100755
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -85,6 +85,7 @@ public:
85 void ModifyGPUMemory(GPUVAddr addr, u64 size) override; 85 void ModifyGPUMemory(GPUVAddr addr, u64 size) override;
86 void SignalSemaphore(GPUVAddr addr, u32 value) override; 86 void SignalSemaphore(GPUVAddr addr, u32 value) override;
87 void SignalSyncPoint(u32 value) override; 87 void SignalSyncPoint(u32 value) override;
88 void SignalReference() override;
88 void ReleaseFences() override; 89 void ReleaseFences() override;
89 void FlushAndInvalidateRegion(VAddr addr, u64 size) override; 90 void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
90 void WaitForIdle() override; 91 void WaitForIdle() override;
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp
index 3bec48d14..c2d6676e7 100755
--- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp
@@ -34,6 +34,10 @@ bool InnerFence::IsSignaled() const {
34 if (is_stubbed) { 34 if (is_stubbed) {
35 return true; 35 return true;
36 } 36 }
37 if (scheduler.IsFree(wait_tick)) {
38 return true;
39 }
40 scheduler.Refresh();
37 return scheduler.IsFree(wait_tick); 41 return scheduler.IsFree(wait_tick);
38} 42}
39 43
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 8ae2202bd..4fbf93ca0 100755
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -600,11 +600,18 @@ void RasterizerVulkan::SignalSyncPoint(u32 value) {
600 fence_manager.SignalSyncPoint(value); 600 fence_manager.SignalSyncPoint(value);
601} 601}
602 602
603void RasterizerVulkan::SignalReference() {
604 if (!gpu.IsAsync()) {
605 return;
606 }
607 fence_manager.SignalOrdering();
608}
609
603void RasterizerVulkan::ReleaseFences() { 610void RasterizerVulkan::ReleaseFences() {
604 if (!gpu.IsAsync()) { 611 if (!gpu.IsAsync()) {
605 return; 612 return;
606 } 613 }
607 fence_manager.WaitPendingFences(); 614 fence_manager.TryReleasePendingFences();
608} 615}
609 616
610void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size) { 617void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size) {
@@ -632,6 +639,7 @@ void RasterizerVulkan::WaitForIdle() {
632 cmdbuf.SetEvent(event, flags); 639 cmdbuf.SetEvent(event, flags);
633 cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {}); 640 cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {});
634 }); 641 });
642 SignalReference();
635} 643}
636 644
637void RasterizerVulkan::FragmentBarrier() { 645void RasterizerVulkan::FragmentBarrier() {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index a29022a50..d15c36ddc 100755
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -77,6 +77,7 @@ public:
77 void ModifyGPUMemory(GPUVAddr addr, u64 size) override; 77 void ModifyGPUMemory(GPUVAddr addr, u64 size) override;
78 void SignalSemaphore(GPUVAddr addr, u32 value) override; 78 void SignalSemaphore(GPUVAddr addr, u32 value) override;
79 void SignalSyncPoint(u32 value) override; 79 void SignalSyncPoint(u32 value) override;
80 void SignalReference() override;
80 void ReleaseFences() override; 81 void ReleaseFences() override;
81 void FlushAndInvalidateRegion(VAddr addr, u64 size) override; 82 void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
82 void WaitForIdle() override; 83 void WaitForIdle() override;
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 3ce48e9d2..9e0a1d4e6 100755
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -83,6 +83,10 @@ public:
83 return master_semaphore->IsFree(tick); 83 return master_semaphore->IsFree(tick);
84 } 84 }
85 85
86 void Refresh() const noexcept {
87 return master_semaphore->Refresh();
88 }
89
86 /// Waits for the given tick to trigger on the GPU. 90 /// Waits for the given tick to trigger on the GPU.
87 void Wait(u64 tick) { 91 void Wait(u64 tick) {
88 master_semaphore->Wait(tick); 92 master_semaphore->Wait(tick);
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index 9fbdc1ac6..47a11cb2f 100755
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -133,8 +133,8 @@ struct BufferImageCopy {
133}; 133};
134 134
135struct BufferCopy { 135struct BufferCopy {
136 size_t src_offset; 136 u64 src_offset;
137 size_t dst_offset; 137 u64 dst_offset;
138 size_t size; 138 size_t size;
139}; 139};
140 140