diff options
| author | Stefan Boberg <[email protected]> | 2021-05-11 13:05:39 +0200 |
|---|---|---|
| committer | Stefan Boberg <[email protected]> | 2021-05-11 13:05:39 +0200 |
| commit | f8d9ac5d13dd37b8b57af0478e77ba1e75c813aa (patch) | |
| tree | 1daf7621e110d48acd5e12e3073ce48ef0dd11b2 | |
| download | zen-f8d9ac5d13dd37b8b57af0478e77ba1e75c813aa.tar.xz zen-f8d9ac5d13dd37b8b57af0478e77ba1e75c813aa.zip | |
Adding zenservice code
300 files changed, 81388 insertions, 0 deletions
diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..f1b47c7d7 --- /dev/null +++ b/.clang-format @@ -0,0 +1,182 @@ +--- +Language: Cpp +# BasedOnStyle: Chromium +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: true +AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true +AlignConsecutiveDeclarations: true +AlignEscapedNewlines: Left +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortEnumsOnASingleLine: false +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: InlineOnly +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: TopLevelDefinitions +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BreakBeforeBraces: Custom +BraceWrapping: + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: Always + AfterEnum: false + AfterFunction: true + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: true + AfterUnion: true + AfterExternBlock: true + BeforeCatch: true + BeforeElse: true + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeComma +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 140 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: true +ConstructorInitializerAllOnOneLineOrOnePerLine: true 
+ConstructorInitializerIndentWidth: 0 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^<ext/.*\.h>' + Priority: 2 + SortPriority: 0 + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentCaseLabels: true +IndentCaseBlocks: true +IndentGotoLabels: false +IndentPPDirectives: AfterHash +IndentExternBlock: AfterExternBlock +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertTrailingCommas: None +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: 'UE_TRACE_EVENT_BEGIN' +MacroBlockEnd: 'UE_TRACE_EVENT_END' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: Inner +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + - ParseTestProto + - ParsePartialTestProto + CanonicalDelimiter: '' + 
BasedOnStyle: google +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Auto +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 4 +UseCRLF: false +UseTab: Always +WhitespaceSensitiveMacros: + - STRINGIZE + - PP_STRINGIZE + - BOOST_PP_STRINGIZE +... diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..5e808b94b --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,6 @@ +--- +Checks: 
'-*,modernize-avoid-bind,modernize-avoid-c-arrays,modernize-concat-nested-namespaces,modernize-deprecated-headers,modernize-deprecated-ios-base-aliases,modernize-loop-convert,modernize-make-shared,modernize-make-unique,modernize-pass-by-value,modernize-raw-string-literal,modernize-redundant-void-arg,modernize-replace-auto-ptr,modernize-replace-random-shuffle,modernize-return-braced-init-list,modernize-shrink-to-fit,modernize-unary-static-assert,modernize-use-auto,modernize-use-bool-literals,modernize-use-default-member-init,modernize-use-emplace,modernize-use-equals-default,modernize-use-equals-delete,modernize-use-nodiscard,modernize-use-noexcept,modernize-use-nullptr,modernize-use-override,modernize-use-transparent-functors,modernize-use-uncaught-exceptions,modernize-use-using,readability-avoid-const-params-in-decls,readability-braces-around-statements,readability-const-return-type,readability-container-size-empty,readability-convert-member-functions-to-static,readability-deleted-default,readability-delete-null-pointer,readability-else-after-return,readability-identifier-naming,readability-implicit-bool-conversion,readability-inconsistent-declaration-parameter-name,readability-isolate-declaration,readability-magic-numbers,readability-misleading-indentation,readability-misplaced-array-index,readability-named-parameter,readability-non-const-parameter,readability-redundant-control-flow,readability-redundant-declaration,readability-redundant-function-ptr-dereference,readability-redundant-member-init,readability-redundant-preprocessor,readability-redundant-smartptr-get,readability-redundant-string-cstr,readability-redundant-string-init,readability-simplify-boolean-expr,readability-simplify-subscript-expr,readability-static-accessed-through-instance,readability-static-definition-in-anonymous-namespace,readability-string-compare,readability-uniqueptr-delete-release' +WarningsAsErrors: '' +HeaderFilterRegex: '.*' +FormatStyle: 'file' +User: stefa diff --git 
a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..11e36773f --- /dev/null +++ b/.editorconfig @@ -0,0 +1,69 @@ +# Visual Studio generated .editorconfig file with C++ settings. +root = true + +[*.{c++,cc,cpp,cppm,cxx,h,h++,hh,hpp,hxx,inl,ipp,ixx,tlh,tli}] + +# Visual C++ Code Style settings + +cpp_generate_documentation_comments = xml + +# Visual C++ Formatting settings + +cpp_indent_braces = false +cpp_indent_multi_line_relative_to = innermost_parenthesis +cpp_indent_within_parentheses = indent +cpp_indent_preserve_within_parentheses = true +cpp_indent_case_contents = true +cpp_indent_case_labels = false +cpp_indent_case_contents_when_block = false +cpp_indent_lambda_braces_when_parameter = false +cpp_indent_goto_labels = one_left +cpp_indent_preprocessor = leftmost_column +cpp_indent_access_specifiers = false +cpp_indent_namespace_contents = true +cpp_indent_preserve_comments = false +cpp_new_line_before_open_brace_namespace = ignore +cpp_new_line_before_open_brace_type = ignore +cpp_new_line_before_open_brace_function = ignore +cpp_new_line_before_open_brace_block = ignore +cpp_new_line_before_open_brace_lambda = ignore +cpp_new_line_scope_braces_on_separate_lines = false +cpp_new_line_close_brace_same_line_empty_type = false +cpp_new_line_close_brace_same_line_empty_function = false +cpp_new_line_before_catch = true +cpp_new_line_before_else = true +cpp_new_line_before_while_in_do_while = false +cpp_space_before_function_open_parenthesis = remove +cpp_space_within_parameter_list_parentheses = false +cpp_space_between_empty_parameter_list_parentheses = false +cpp_space_after_keywords_in_control_flow_statements = true +cpp_space_within_control_flow_statement_parentheses = false +cpp_space_before_lambda_open_parenthesis = false +cpp_space_within_cast_parentheses = false +cpp_space_after_cast_close_parenthesis = false +cpp_space_within_expression_parentheses = false +cpp_space_before_block_open_brace = true +cpp_space_between_empty_braces = false 
+cpp_space_before_initializer_list_open_brace = false +cpp_space_within_initializer_list_braces = true +cpp_space_preserve_in_initializer_list = true +cpp_space_before_open_square_bracket = false +cpp_space_within_square_brackets = false +cpp_space_before_empty_square_brackets = false +cpp_space_between_empty_square_brackets = false +cpp_space_group_square_brackets = true +cpp_space_within_lambda_brackets = false +cpp_space_between_empty_lambda_brackets = false +cpp_space_before_comma = false +cpp_space_after_comma = true +cpp_space_remove_around_member_operators = true +cpp_space_before_inheritance_colon = true +cpp_space_before_constructor_colon = true +cpp_space_remove_before_semicolon = true +cpp_space_after_semicolon = true +cpp_space_remove_around_unary_operator = true +cpp_space_around_binary_operator = insert +cpp_space_around_assignment_operator = insert +cpp_space_pointer_reference_alignment = left +cpp_space_around_ternary_operator = insert +cpp_wrap_preserve_blocks = one_liners diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..4395cd288 --- /dev/null +++ b/.gitignore @@ -0,0 +1,213 @@ +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. 
+## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ +**/Properties/launchSettings.json + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# 
Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + + + + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + + + + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + + + + + + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + + +/vcpkg_installed +.data/ +.minio_data/ +.test/ diff --git a/3rdparty/BLAKE3/.github/workflows/build_b3sum.py b/3rdparty/BLAKE3/.github/workflows/build_b3sum.py new file mode 100644 index 000000000..e487daf97 --- /dev/null +++ b/3rdparty/BLAKE3/.github/workflows/build_b3sum.py @@ -0,0 +1,37 @@ +#! 
/usr/bin/env python3 + +from pathlib import Path +import platform +import shutil +import subprocess +import sys + +ROOT = Path(__file__).parent.parent.parent +RUST_TARGET = sys.argv[1] + +subprocess.run(["cargo", "build", "--target", sys.argv[1], "--release"], + cwd=ROOT / "b3sum") + +if platform.system() == "Windows": + original_exe_name = "b3sum.exe" +else: + original_exe_name = "b3sum" + +if platform.system() == "Windows": + new_exe_name = "b3sum_windows_x64_bin.exe" +elif platform.system() == "Darwin": + new_exe_name = "b3sum_macos_x64_bin" +elif platform.system() == "Linux": + new_exe_name = "b3sum_linux_x64_bin" +else: + raise RuntimeError("Unexpected platform: " + platform.system()) + +# Copy the built binary so that it has the upload name we want. +out_dir = ROOT / "b3sum/target" / RUST_TARGET / "release" +original_exe_path = str(out_dir / original_exe_name) +new_exe_path = str(out_dir / new_exe_name) +print("copying", repr(original_exe_path), "to", repr(new_exe_path)) +shutil.copyfile(original_exe_path, new_exe_path) + +# This lets the subsequent upload step get the filepath. 
+print("::set-output name=bin_path::" + new_exe_path) diff --git a/3rdparty/BLAKE3/.github/workflows/ci.yml b/3rdparty/BLAKE3/.github/workflows/ci.yml new file mode 100644 index 000000000..464a411d5 --- /dev/null +++ b/3rdparty/BLAKE3/.github/workflows/ci.yml @@ -0,0 +1,208 @@ +name: tests + +on: + push: + branches: + - "*" + # not on tags + pull_request: + +env: + BLAKE3_CI: "1" + RUSTFLAGS: "-D warnings" + RUST_BACKTRACE: "1" + +jobs: + cargo_tests: + name: ${{ matrix.target.name }} ${{ matrix.channel }} + runs-on: ${{ matrix.target.os }} + strategy: + fail-fast: false + matrix: + target: [ + { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, + { "os": "macOS-latest", "toolchain": "x86_64-apple-darwin", "name": "macOS" }, + { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, + { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } + ] + channel: [stable, beta, nightly] + + steps: + - uses: actions/checkout@v1 + - uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} + profile: minimal + override: true + # Print the compiler version, for debugging. + - name: print compiler version + run: cargo run --quiet + working-directory: ./tools/compiler_version + # Print out instruction set support, for debugging. + - name: print instruction set support + run: cargo run --quiet + working-directory: ./tools/instruction_set_support + # Default tests plus Rayon. + - run: cargo test --features=rayon + # no_std tests. + - run: cargo test --no-default-features + + # A matrix of different test settings: + # - debug vs release + # - assembly vs Rust+C intrinsics vs pure Rust intrinsics + # - different levels of SIMD support + # + # Full SIMD support. 
+ - run: cargo test --features= + - run: cargo test --features=prefer_intrinsics + - run: cargo test --features=pure + - run: cargo test --features= --release + - run: cargo test --features=prefer_intrinsics --release + - run: cargo test --features=pure --release + # No AVX-512. + - run: cargo test --features=no_avx512 + - run: cargo test --features=no_avx512,prefer_intrinsics + - run: cargo test --features=no_avx512,pure + - run: cargo test --features=no_avx512 --release + - run: cargo test --features=no_avx512,prefer_intrinsics --release + - run: cargo test --features=no_avx512,pure --release + # No AVX2. + - run: cargo test --features=no_avx512,no_avx2 + - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics + - run: cargo test --features=no_avx512,no_avx2,pure + - run: cargo test --features=no_avx512,no_avx2 --release + - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics --release + - run: cargo test --features=no_avx512,no_avx2,pure --release + # No SSE4.1 + - run: cargo test --features=no_avx512,no_avx2,no_sse41 + - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics + - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure + - run: cargo test --features=no_avx512,no_avx2,no_sse41 --release + - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics --release + - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure --release + # No SSE2 + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 --release + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics --release + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure --release + + # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. 
+ - run: cargo test --benches + env: + RUSTC_BOOTSTRAP: 1 + # Test vectors. + - name: test vectors + run: cargo test + working-directory: ./test_vectors + - name: test vectors intrinsics + run: cargo test --features=prefer_intrinsics + working-directory: ./test_vectors + - name: test vectors pure + run: cargo test --features=pure + working-directory: ./test_vectors + # Test b3sum. + - name: test b3sum + run: cargo test + working-directory: ./b3sum + - name: test b3sum --no-default-features + run: cargo test --no-default-features + working-directory: ./b3sum + # Test C code. + - name: cargo test C bindings assembly + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + - name: cargo test C bindings intrinsics + run: cargo test --features=prefer_intrinsics + working-directory: ./c/blake3_c_rust_bindings + # Reference impl doc test. + - name: reference impl doc test + run: cargo test + working-directory: ./reference_impl + + cross_tests: + name: cross ${{ matrix.arch }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + arch: + - i586-unknown-linux-musl + - i686-unknown-linux-musl + - armv7-unknown-linux-gnueabihf + - aarch64-unknown-linux-gnu + - mips-unknown-linux-gnu + + steps: + - uses: actions/checkout@v1 + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - run: cargo install cross + # Test the portable implementation on everything. + - run: cross test --target ${{ matrix.arch }} + # Test building for ancient i386 processors without guaranteed SSE2 support. + - run: cross rustc --target ${{ matrix.arch }} -- -C target-cpu=i386 + if: startsWith(matrix.arch, 'i586-') || startsWith(matrix.arch, 'i686-') + # Test the NEON implementation on ARM targets. + - run: cross test --target ${{ matrix.arch }} --features=neon + if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') + # Test vectors. Note that this uses a hacky script due to path dependency limitations. 
+ - run: ./test_vectors/cross_test.sh --target ${{ matrix.arch }} + # C code. Same issue with the hacky script. + - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} + - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} --features=neon + if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') + + # Currently only on x86. + c_tests: + name: C Makefile tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + # Test the intrinsics-based implementations. + - run: make -f Makefile.testing test + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_sse2.c + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_sse41.c + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_avx2.c + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_avx512.c + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test + working-directory: ./c + # Test the assembly implementations. 
+ - run: make -f Makefile.testing test_asm + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_sse2_x86-64_unix.S + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test_asm + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_sse41_x86-64_unix.S + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test_asm + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_avx2_x86-64_unix.S + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test_asm + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_avx512_x86-64_unix.S + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test_asm + working-directory: ./c + # Restore the files we deleted above. + - run: git checkout . + # Build the example. + - run: make -f Makefile.testing example + working-directory: ./c diff --git a/3rdparty/BLAKE3/.github/workflows/tag.yml b/3rdparty/BLAKE3/.github/workflows/tag.yml new file mode 100644 index 000000000..577d4f312 --- /dev/null +++ b/3rdparty/BLAKE3/.github/workflows/tag.yml @@ -0,0 +1,45 @@ +name: publish_b3sum_binaries + +on: + push: + tags: + - "*" + +env: + BLAKE3_CI: "1" + RUSTFLAGS: "-D warnings" + +jobs: + cargo_tests: + name: ${{ matrix.target.name }} + runs-on: ${{ matrix.target.os }} + strategy: + fail-fast: false + matrix: + target: [ + { "os": "ubuntu-latest", "rust-target": "x86_64-unknown-linux-musl", "name": "Linux" }, + { "os": "macOS-latest", "rust-target": "x86_64-apple-darwin", "name": "macOS" }, + { "os": "windows-latest", "rust-target": "x86_64-pc-windows-msvc", "name": "Windows" }, + ] + + steps: + - uses: actions/checkout@v1 + - uses: actions/setup-python@v1 + with: + python-version: "3.x" + - run: pip install PyGithub + - run: sudo apt-get install musl-tools + if: 
matrix.target.os == 'ubuntu-latest' + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + profile: minimal + - run: rustup target add ${{ matrix.target.rust-target }} + - name: build b3sum + id: build_b3sum + run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} + - name: upload release asset + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TAG: ${{ github.ref }} + run: python -u .github/workflows/upload_github_release_asset.py ${{ steps.build_b3sum.outputs.bin_path }} diff --git a/3rdparty/BLAKE3/.github/workflows/upload_github_release_asset.py b/3rdparty/BLAKE3/.github/workflows/upload_github_release_asset.py new file mode 100644 index 000000000..c1cbf518b --- /dev/null +++ b/3rdparty/BLAKE3/.github/workflows/upload_github_release_asset.py @@ -0,0 +1,65 @@ +#! /usr/bin/env python3 + +import github +import os +import sys + +RETRIES = 10 + +g = github.Github(os.environ["GITHUB_TOKEN"]) +tag_name = os.environ["GITHUB_TAG"] +tag_prefix = "refs/tags/" +if tag_name.startswith(tag_prefix): + tag_name = tag_name[len(tag_prefix):] +assert len(sys.argv) == 2 +asset_path = sys.argv[1] +asset_name = os.path.basename(asset_path) + +repo = g.get_repo(os.environ["GITHUB_REPOSITORY"]) + +tags = list(repo.get_tags()) + +for tag in tags: + if tag.name == tag_name: + break +else: + raise RuntimeError("no tag named " + repr(tag_name)) + +try: + print("Creating GitHub release for tag " + repr(tag_name) + "...") + repo.create_git_release(tag_name, tag_name, tag.commit.commit.message) +except github.GithubException as github_error: + if github_error.data["errors"][0]["code"] == "already_exists": + print("Release for tag " + repr(tag_name) + " already exists.") + else: + raise + +releases = list(repo.get_releases()) +for release in releases: + if release.tag_name == tag_name: + break +else: + raise RuntimeError("no release for tag " + repr(tag_name)) + +print("Uploading " + repr(asset_path) + "...") +for i in range(RETRIES): + try: + 
print("Upload attempt #{} of {}...".format(i + 1, RETRIES)) + release.upload_asset(asset_path) + break + except github.GithubException as github_error: + # Unfortunately the asset upload API is flaky. Even worse, it often + # partially succeeds, returning an error to the caller but leaving the + # release in a state where subsequent uploads of the same asset will + # fail with an "already_exists" error. (Though the asset is not visible + # on github.com, so we can't just declare victory and move on.) If we + # detect this case, explicitly delete the asset and continue retrying. + print(github_error) + for asset in release.get_assets(): + if asset.name == asset_name: + print("Found uploaded asset after failure. Deleting...") + asset.delete_asset() +else: + raise RuntimeError("All upload attempts failed.") + +print("Success!") diff --git a/3rdparty/BLAKE3/.gitignore b/3rdparty/BLAKE3/.gitignore new file mode 100644 index 000000000..fa8d85ac5 --- /dev/null +++ b/3rdparty/BLAKE3/.gitignore @@ -0,0 +1,2 @@ +Cargo.lock +target diff --git a/3rdparty/BLAKE3/CONTRIBUTING.md b/3rdparty/BLAKE3/CONTRIBUTING.md new file mode 100644 index 000000000..3a605f255 --- /dev/null +++ b/3rdparty/BLAKE3/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing + +We welcome and encourage third-party contributions to BLAKE3, be it reports of issues encountered while using the software or proposals of patches. + +## Bug reports + +Bugs and other problems should be reported on [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). + +If you report a bug, please: + +* Check that it's not already reported in the [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). +* Provide information to help us diagnose and ideally reproduce the bug. + +## Patches + +We encourage you to fix a bug via a [GitHub Pull request](https://github.com/BLAKE3/BLAKE3/pulls), preferably after creating a related issue and referring to it in the PR. 
+ +If you contribute code and submit a patch, please note the following: + +* We use Rust's stable branch for developing BLAKE3. +* Pull requests should target the `master` branch. +* Try to follow the established Rust [style guidelines](https://doc.rust-lang.org/1.0.0/style/). + +Also please make sure to create new unit tests covering your code additions. You can execute the tests by running: + +```bash +cargo test +``` + +All third-party contributions will be recognized in the list of contributors. + diff --git a/3rdparty/BLAKE3/Cargo.toml b/3rdparty/BLAKE3/Cargo.toml new file mode 100644 index 000000000..3df0fd279 --- /dev/null +++ b/3rdparty/BLAKE3/Cargo.toml @@ -0,0 +1,90 @@ +[package] +name = "blake3" +version = "0.3.7" +authors = ["Jack O'Connor <[email protected]>"] +description = "the BLAKE3 hash function" +repository = "https://github.com/BLAKE3-team/BLAKE3" +license = "CC0-1.0 OR Apache-2.0" +documentation = "https://docs.rs/blake3" +readme = "README.md" +edition = "2018" + +[features] +default = ["std"] + +# The NEON implementation does not participate in dynamic feature detection, +# which is currently x86-only. If "neon" is on, NEON support is assumed. Note +# that AArch64 always supports NEON, but support on ARMv7 varies. The NEON +# implementation uses C intrinsics and requires a C compiler. +neon = [] + +# This crate uses libstd for std::io trait implementations, and also for +# runtime CPU feature detection. This feature is enabled by default. If you use +# --no-default-features, the only way to use the SIMD implementations in this +# crate is to enable the corresponding instruction sets statically for the +# entire build, with e.g. RUSTFLAGS="-C target-cpu=native". +std = ["digest/std"] + +# The "rayon" feature (defined below as an optional dependency) enables the +# join::RayonJoin type, which can be used with Hasher::update_with_join to +# perform multi-threaded hashing. 
However, even if this feature is enabled, all +# other APIs remain single-threaded. + +# ---------- Features below this line are for internal testing only. ---------- + +# By default on x86_64, this crate uses Samuel Neves' hand-written assembly +# implementations for SSE4.1, AVX2, and AVX512. (These provide both the best +# runtime performance, and the fastest build times.) And by default on 32-bit +# x86, this crate uses Rust intrinsics implementations for SSE4.1 and AVX2, and +# a C intrinsics implementation for AVX-512. In both cases, if a C compiler is +# not detected, or if AVX-512 support is missing from the detected compiler, +# build.rs automatically falls back to a pure Rust build. This feature forces +# that fallback, for testing purposes. (Note that in CI testing, we set the +# BLAKE3_CI environment variable, which instructs build.rs to error out rather +# than doing an automatic fallback.) +pure = [] + +# As described above, on x86_64 this crate uses assembly implementations by +# default. Enabling the "prefer_intrinsics" feature makes this crate use +# intrinsics implementations on both 32-bit and 64-bit x86, again for testing +# purposes. +prefer_intrinsics = [] + +# Disable individual instruction sets. CI testing uses these flags to simulate +# different levels of hardware SIMD support. Note that code for the +# corresponding instruction set is still compiled; only detection is disabled. +# +# As noted above, these flags are *for testing only* and are not stable. It's +# possible that some users might find that their particular use case performs +# better if e.g. AVX-512 is disabled, because of issues like CPU downclocking. +# If that comes up, and if disabling the instruction set here at the feature +# level turns out to be the right approach, then we can design a stable +# feature. Until then, we reserve the right to break these features in a patch +# release. 
+no_sse2 = [] +no_sse41 = [] +no_avx2 = [] +no_avx512 = [] + +[package.metadata.docs.rs] +# Document blake3::join::RayonJoin on docs.rs. +features = ["rayon"] + +[dependencies] +arrayref = "0.3.5" +arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] } +constant_time_eq = "0.1.5" +rayon = { version = "1.2.1", optional = true } +cfg-if = "0.1.10" +digest = "0.9.0" +crypto-mac = "0.8.0" + +[dev-dependencies] +hex = "0.4.2" +page_size = "0.4.1" +rand = "0.7.2" +rand_chacha = "0.2.1" +reference_impl = { path = "./reference_impl" } + +[build-dependencies] +cc = "1.0.4" diff --git a/3rdparty/BLAKE3/LICENSE b/3rdparty/BLAKE3/LICENSE new file mode 100644 index 000000000..f5892efc3 --- /dev/null +++ b/3rdparty/BLAKE3/LICENSE @@ -0,0 +1,330 @@ +This work is released into the public domain with CC0 1.0. Alternatively, it is +licensed under the Apache License 2.0. + +------------------------------------------------------------------------------- + +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). 
+ +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. 
database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. 
Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. + +------------------------------------------------------------------------------- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/3rdparty/BLAKE3/README.md b/3rdparty/BLAKE3/README.md new file mode 100644 index 000000000..360183668 --- /dev/null +++ b/3rdparty/BLAKE3/README.md @@ -0,0 +1,202 @@ +# <a href="#"><img src="media/BLAKE3.svg" alt="BLAKE3" height=50></a> + +BLAKE3 is a cryptographic hash function that is: + +- **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2. +- **Secure**, unlike MD5 and SHA-1. And secure against length extension, + unlike SHA-2. +- **Highly parallelizable** across any number of threads and SIMD lanes, + because it's a Merkle tree on the inside. +- Capable of **verified streaming** and **incremental updates**, again + because it's a Merkle tree. +- A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash. +- **One algorithm with no variants**, which is fast on x86-64 and also + on smaller architectures. + +The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py) +is an example benchmark of 16 KiB inputs on modern server hardware (a Cascade +Lake-SP 8275CL processor). For more detailed benchmarks, see the +[BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). 
+ +<p align="center"> +<img src="media/speed.svg" alt="performance graph"> +</p> + +BLAKE3 is based on an optimized instance of the established hash +function [BLAKE2](https://blake2.net) and on the [original Bao tree +mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md). +The specifications and design rationale are available in the [BLAKE3 +paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). +The default output size is 256 bits. The current version of +[Bao](https://github.com/oconnor663/bao) implements verified streaming +with BLAKE3. + +This repository is the official implementation of BLAKE3. It includes: + +* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which + includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512, + and NEON, with automatic runtime CPU feature detection on x86. The + `rayon` feature provides multithreading. + +* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which + provides a command line interface. It uses multithreading by default, + making it an order of magnitude faster than e.g. `sha256sum` on + typical desktop hardware. + +* The [C implementation](c), which like the Rust implementation includes + SIMD code and runtime CPU feature detection on x86. Unlike the Rust + implementation, it's not currently multithreaded. See + [`c/README.md`](c/README.md). + +* The [reference implementation](reference_impl/reference_impl.rs), + which is discussed in Section 5.1 of the [BLAKE3 + paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). + This implementation is much smaller and simpler than the optimized + ones above. If you want to see how BLAKE3 works, or you're writing a + port that doesn't need multithreading or SIMD optimizations, start + here. + +* A [set of test + vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json) + that covers extended outputs, all three modes, and a variety of input + lengths. 
+ +* [Actions Status](https://github.com/BLAKE3-team/BLAKE3/actions) + +BLAKE3 was designed by: + +* [@oconnor663](https://github.com/oconnor663) (Jack O'Connor) +* [@sneves](https://github.com/sneves) (Samuel Neves) +* [@veorq](https://github.com/veorq) (Jean-Philippe Aumasson) +* [@zookozcash](https://github.com/zookozcash) (Zooko) + +The development of BLAKE3 was sponsored by +[Teserakt](https://teserakt.io) and [Electric Coin Company](https://electriccoin.co). + +*NOTE: BLAKE3 is not a password hashing algorithm, because it's +designed to be fast, whereas password hashing should not be fast. If you +hash passwords to store the hashes or if you derive keys from passwords, +we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* + +## Usage + +### The `b3sum` utility + +The `b3sum` command line utility prints the BLAKE3 hashes of files or of +standard input. Prebuilt binaries are available for Linux, Windows, and +macOS (requiring the [unidentified developer +workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) +on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). +If you've [installed Rust and +Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), +you can also build `b3sum` yourself with: + +```bash +cargo install b3sum +``` + +If `rustup` didn't configure your `PATH` for you, you might need to go +looking for the installed binary in e.g. `~/.cargo/bin`. You can test +out how fast BLAKE3 is on your machine by creating a big file and +hashing it, for example: + +```bash +# Create a 1 GB file. +head -c 1000000000 /dev/zero > /tmp/bigfile +# Hash it with SHA-256. +time openssl sha256 /tmp/bigfile +# Hash it with BLAKE3. +time b3sum /tmp/bigfile +``` + +### The `blake3` crate [docs.rs](https://docs.rs/blake3) + +To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to +your `Cargo.toml`.
Here's an example of hashing some input bytes: + +```rust +// Hash an input all at once. +let hash1 = blake3::hash(b"foobarbaz"); + +// Hash an input incrementally. +let mut hasher = blake3::Hasher::new(); +hasher.update(b"foo"); +hasher.update(b"bar"); +hasher.update(b"baz"); +let hash2 = hasher.finalize(); +assert_eq!(hash1, hash2); + +// Extended output. OutputReader also implements Read and Seek. +let mut output = [0; 1000]; +let mut output_reader = hasher.finalize_xof(); +output_reader.fill(&mut output); +assert_eq!(&output[..32], hash1.as_bytes()); + +// Print a hash as hex. +println!("{}", hash1.to_hex()); +``` + +Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and +`derive_key`. The `keyed_hash` mode takes a 256-bit key: + +```rust +// MAC an input all at once. +let example_key = [42u8; 32]; +let mac1 = blake3::keyed_hash(&example_key, b"example input"); + +// MAC incrementally. +let mut hasher = blake3::Hasher::new_keyed(&example_key); +hasher.update(b"example input"); +let mac2 = hasher.finalize(); +assert_eq!(mac1, mac2); +``` + +The `derive_key` mode takes a context string of any length and key +material of any length, and it outputs a derived key of any length. The +context string should be hardcoded, globally unique, and +application-specific. A good default format for the context string is +`"[application] [commit timestamp] [purpose]"`: + +```rust +// Derive a couple of subkeys for different purposes. +const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key"; +const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key"; +let input_key = b"some very secret key material (>'-')> <('-'<) ^('-')^"; +let mut email_key = [0; 32]; +blake3::derive_key(EMAIL_CONTEXT, input_key, &mut email_key); +let mut api_key = [0; 32]; +blake3::derive_key(API_CONTEXT, input_key, &mut api_key); +assert!(email_key != api_key); +``` + +### The C implementation + +See [`c/README.md`](c/README.md). 
+ +### Other implementations + +We post links to third-party bindings and implementations on the +[@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever +we hear about them. Some highlights include [an optimized Go +implementation](https://github.com/zeebo/blake3), [Wasm bindings for +Node.js and browsers](https://github.com/connor4312/blake3), and [binary +wheels for Python](https://github.com/oconnor663/blake3-py). + +## Contributing + +Please see [CONTRIBUTING.md](CONTRIBUTING.md). + +## Intellectual property + +The Rust code is copyright Jack O'Connor, 2019-2020. The C code is +copyright Samuel Neves and Jack O'Connor, 2019-2020. The assembly code +is copyright Samuel Neves, 2019-2020. + +This work is released into the public domain with CC0 1.0. +Alternatively, it is licensed under the Apache License 2.0. + +## Miscellany + +- [@veorq](https://github.com/veorq) and + [@oconnor663](https://github.com/oconnor663) did [a podcast + interview](https://www.cryptography.fm/3) about designing BLAKE3. 
diff --git a/3rdparty/BLAKE3/b3sum/Cargo.toml b/3rdparty/BLAKE3/b3sum/Cargo.toml new file mode 100644 index 000000000..4678bee2d --- /dev/null +++ b/3rdparty/BLAKE3/b3sum/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "b3sum" +version = "0.3.7" +authors = ["Jack O'Connor <[email protected]>"] +description = "a command line implementation of the BLAKE3 hash function" +repository = "https://github.com/BLAKE3-team/BLAKE3" +license = "CC0-1.0 OR Apache-2.0" +readme = "README.md" +edition = "2018" + +[features] +neon = ["blake3/neon"] +prefer_intrinsics = ["blake3/prefer_intrinsics"] +pure = ["blake3/pure"] + +[dependencies] +anyhow = "1.0.25" +blake3 = { version = "0.3", path = "..", features = ["rayon"] } +clap = "2.33.1" +hex = "0.4.0" +memmap = "0.7.0" +rayon = "1.2.1" +wild = "2.0.3" + +[dev-dependencies] +duct = "0.13.3" +tempfile = "3.1.0" diff --git a/3rdparty/BLAKE3/b3sum/README.md b/3rdparty/BLAKE3/b3sum/README.md new file mode 100644 index 000000000..e97830b7c --- /dev/null +++ b/3rdparty/BLAKE3/b3sum/README.md @@ -0,0 +1,86 @@ +# b3sum + +A command line utility for calculating +[BLAKE3](https://github.com/BLAKE3-team/BLAKE3) hashes, similar to +Coreutils tools like `b2sum` or `md5sum`. + +``` +b3sum 0.3.6 + +USAGE: + b3sum [FLAGS] [OPTIONS] [FILE]... + +FLAGS: + -c, --check Reads BLAKE3 sums from the [file]s and checks them + -h, --help Prints help information + --keyed Uses the keyed mode. The secret key is read from standard + input, and it must be exactly 32 raw bytes. + --no-mmap Disables memory mapping. Currently this also disables + multithreading. + --no-names Omits filenames in the output + --quiet Skips printing OK for each successfully verified file. + Must be used with --check. + --raw Writes raw output bytes to stdout, rather than hex. + --no-names is implied. In this case, only a single + input is allowed. 
+ -V, --version Prints version information + +OPTIONS: + --derive-key <CONTEXT> Uses the key derivation mode, with the given + context string. Cannot be used with --keyed. + -l, --length <LEN> The number of output bytes, prior to hex + encoding (default 32) + --num-threads <NUM> The maximum number of threads to use. By + default, this is the number of logical cores. + If this flag is omitted, or if its value is 0, + RAYON_NUM_THREADS is also respected. + +ARGS: + <FILE>... Files to hash, or checkfiles to check. When no file is given, + or when - is given, read standard input. +``` + +See also [this document about how the `--check` flag +works](https://github.com/BLAKE3-team/BLAKE3/blob/master/b3sum/what_does_check_do.md). + +# Example + +Hash the file `foo.txt`: + +```bash +b3sum foo.txt +``` + +Time hashing a gigabyte of data, to see how fast it is: + +```bash +# Create a 1 GB file. +head -c 1000000000 /dev/zero > /tmp/bigfile +# Hash it with SHA-256. +time openssl sha256 /tmp/bigfile +# Hash it with BLAKE3. +time b3sum /tmp/bigfile +``` + + +# Installation + +Prebuilt binaries are available for Linux, Windows, and macOS (requiring +the [unidentified developer +workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) +on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). +If you've [installed Rust and +Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), +you can also build `b3sum` yourself with: + +``` +cargo install b3sum +``` + +On Linux for example, Cargo will put the compiled binary in +`~/.cargo/bin`. You might want to add that directory to your `$PATH`, or +`rustup` might have done it for you when you installed Cargo. + +If you want to install directly from this directory, you can run `cargo +install --path .`. Or you can just build with `cargo build --release`, +which puts the binary at `./target/release/b3sum`. 
diff --git a/3rdparty/BLAKE3/b3sum/src/main.rs b/3rdparty/BLAKE3/b3sum/src/main.rs new file mode 100644 index 000000000..b01e5de58 --- /dev/null +++ b/3rdparty/BLAKE3/b3sum/src/main.rs @@ -0,0 +1,621 @@ +use anyhow::{bail, ensure, Context, Result}; +use clap::{App, Arg}; +use std::cmp; +use std::convert::TryInto; +use std::fs::File; +use std::io; +use std::io::prelude::*; +use std::path::{Path, PathBuf}; + +#[cfg(test)] +mod unit_tests; + +const NAME: &str = "b3sum"; + +/* Argument names, used both to register each option with clap and to look it up in the parsed matches. */ +const FILE_ARG: &str = "FILE"; +const DERIVE_KEY_ARG: &str = "derive-key"; +const KEYED_ARG: &str = "keyed"; +const LENGTH_ARG: &str = "length"; +const NO_MMAP_ARG: &str = "no-mmap"; +const NO_NAMES_ARG: &str = "no-names"; +const NUM_THREADS_ARG: &str = "num-threads"; +const RAW_ARG: &str = "raw"; +const CHECK_ARG: &str = "check"; +const QUIET_ARG: &str = "quiet"; + +/* Parsed command line: the raw clap matches, the input paths (a single `-` entry when no FILE is given), and a hasher preconfigured for the selected mode (keyed, derive-key, or plain) that Input::hash clones. */ +struct Args { + inner: clap::ArgMatches<'static>, + file_args: Vec<PathBuf>, + base_hasher: blake3::Hasher, +} + +impl Args { + /* Defines the CLI, parses the process arguments, and builds Args. Returns an error when --raw is given with more than one input, or when read_key_from_stdin fails in keyed mode. */ + fn parse() -> Result<Self> { + let inner = App::new(NAME) + .version(env!("CARGO_PKG_VERSION")) + .arg(Arg::with_name(FILE_ARG).multiple(true).help( + "Files to hash, or checkfiles to check. When no file is given,\n\ + or when - is given, read standard input.", + )) + .arg( + Arg::with_name(LENGTH_ARG) + .long(LENGTH_ARG) + .short("l") + .takes_value(true) + .value_name("LEN") + .help( + "The number of output bytes, prior to hex\n\ + encoding (default 32)", + ), + ) + .arg( + Arg::with_name(NUM_THREADS_ARG) + .long(NUM_THREADS_ARG) + .takes_value(true) + .value_name("NUM") + .help( + "The maximum number of threads to use. By\n\ + default, this is the number of logical cores.\n\ + If this flag is omitted, or if its value is 0,\n\ + RAYON_NUM_THREADS is also respected.", + ), + ) + .arg( + Arg::with_name(KEYED_ARG) + .long(KEYED_ARG) + .requires(FILE_ARG) + .help( + "Uses the keyed mode.
The secret key is read from standard\n\ + input, and it must be exactly 32 raw bytes.", + ), + ) + .arg( + Arg::with_name(DERIVE_KEY_ARG) + .long(DERIVE_KEY_ARG) + .conflicts_with(KEYED_ARG) + .takes_value(true) + .value_name("CONTEXT") + .help( + "Uses the key derivation mode, with the given\n\ + context string. Cannot be used with --keyed.", + ), + ) + .arg(Arg::with_name(NO_MMAP_ARG).long(NO_MMAP_ARG).help( + "Disables memory mapping. Currently this also disables\n\ + multithreading.", + )) + .arg( + Arg::with_name(NO_NAMES_ARG) + .long(NO_NAMES_ARG) + .help("Omits filenames in the output"), + ) + .arg(Arg::with_name(RAW_ARG).long(RAW_ARG).help( + "Writes raw output bytes to stdout, rather than hex.\n\ + --no-names is implied. In this case, only a single\n\ + input is allowed.", + )) + .arg( + Arg::with_name(CHECK_ARG) + .long(CHECK_ARG) + .short("c") + .conflicts_with(DERIVE_KEY_ARG) + .conflicts_with(KEYED_ARG) + .conflicts_with(LENGTH_ARG) + .conflicts_with(RAW_ARG) + .conflicts_with(NO_NAMES_ARG) + .help("Reads BLAKE3 sums from the [file]s and checks them"), + ) + .arg( + Arg::with_name(QUIET_ARG) + .long(QUIET_ARG) + .requires(CHECK_ARG) + .help( + "Skips printing OK for each successfully verified file.\n\ + Must be used with --check.", + ), + ) + // wild::args_os() is equivalent to std::env::args_os() on Unix, + // but on Windows it adds support for globbing. + .get_matches_from(wild::args_os()); + let file_args = if let Some(iter) = inner.values_of_os(FILE_ARG) { + iter.map(|s| s.into()).collect() + } else { + vec!["-".into()] + }; + if inner.is_present(RAW_ARG) && file_args.len() > 1 { + bail!("Only one filename can be provided when using --raw"); + } + let base_hasher = if inner.is_present(KEYED_ARG) { + // In keyed mode, since stdin is used for the key, we can't handle + // `-` arguments. Input::open handles that case below. + blake3::Hasher::new_keyed(&read_key_from_stdin()?)
+ } else if let Some(context) = inner.value_of(DERIVE_KEY_ARG) { + blake3::Hasher::new_derive_key(context) + } else { + blake3::Hasher::new() + }; + Ok(Self { + inner, + file_args, + base_hasher, + }) + } + + /* Returns the parsed --num-threads value, or None when the flag is absent; a value that does not parse is an error. */ + fn num_threads(&self) -> Result<Option<usize>> { + if let Some(num_threads_str) = self.inner.value_of(NUM_THREADS_ARG) { + Ok(Some( + num_threads_str + .parse() + .context("Failed to parse num threads.")?, + )) + } else { + Ok(None) + } + } + + /* The boolean accessors below each report whether the corresponding flag was present on the command line. */ + fn check(&self) -> bool { + self.inner.is_present(CHECK_ARG) + } + + fn raw(&self) -> bool { + self.inner.is_present(RAW_ARG) + } + + fn no_mmap(&self) -> bool { + self.inner.is_present(NO_MMAP_ARG) + } + + fn no_names(&self) -> bool { + self.inner.is_present(NO_NAMES_ARG) + } + + /* Returns the --length value parsed as u64, or blake3::OUT_LEN when the flag is absent; a value that does not parse is an error. */ + fn len(&self) -> Result<u64> { + if let Some(length) = self.inner.value_of(LENGTH_ARG) { + length.parse::<u64>().context("Failed to parse length.") + } else { + Ok(blake3::OUT_LEN as u64) + } + } + + fn keyed(&self) -> bool { + self.inner.is_present(KEYED_ARG) + } + + fn quiet(&self) -> bool { + self.inner.is_present(QUIET_ARG) + } +} + +/* An opened input: a memory-mapped file behind a cursor, a plain file handle, or standard input. */ +enum Input { + Mmap(io::Cursor<memmap::Mmap>), + File(File), + Stdin, +} + +impl Input { + // Open an input file, using mmap if appropriate. "-" means stdin. Note + // that this convention applies both to command line arguments, and to + // filepaths that appear in a checkfile. + fn open(path: &Path, args: &Args) -> Result<Self> { + if path == Path::new("-") { + if args.keyed() { + bail!("Cannot open `-` in keyed mode"); + } + return Ok(Self::Stdin); + } + let file = File::open(path)?; + if !args.no_mmap() { + if let Some(mmap) = maybe_memmap_file(&file)? { + return Ok(Self::Mmap(io::Cursor::new(mmap))); + } + } + Ok(Self::File(file)) + } + + fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> { + let mut hasher = args.base_hasher.clone(); + match self { + // The fast path: If we mmapped the file successfully, hash using + // multiple threads.
This doesn't work on stdin, or on some files, + // and it can also be disabled with --no-mmap. + Self::Mmap(cursor) => { + hasher.update_with_join::<blake3::join::RayonJoin>(cursor.get_ref()); + } + // The slower paths, for stdin or files we didn't/couldn't mmap. + // This is currently all single-threaded. Doing multi-threaded + // hashing without memory mapping is tricky, since all your worker + // threads have to stop every time you refill the buffer, and that + // ends up being a lot of overhead. To solve that, we need a more + // complicated double-buffering strategy where a background thread + // fills one buffer while the worker threads are hashing the other + // one. We might implement that in the future, but since this is + // the slow path anyway, it's not high priority. + Self::File(file) => { + copy_wide(file, &mut hasher)?; + } + Self::Stdin => { + let stdin = io::stdin(); + let lock = stdin.lock(); + copy_wide(lock, &mut hasher)?; + } + } + Ok(hasher.finalize_xof()) + } +} + +impl Read for Input { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + match self { + Self::Mmap(cursor) => cursor.read(buf), + Self::File(file) => file.read(buf), + Self::Stdin => io::stdin().read(buf), + } + } +} + +// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets +// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms +// can support at least 64 KiB, and there's some performance benefit to using +// bigger reads, so that's what we use here. +fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> { + let mut buffer = [0; 65536]; + let mut total = 0; + loop { + match reader.read(&mut buffer) { + Ok(0) => return Ok(total), + Ok(n) => { + hasher.update(&buffer[..n]); + total += n as u64; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + } + } +} + +// Mmap a file, if it looks like a good idea. 
Return None in cases where we +// know mmap will fail, or if the file is short enough that mmapping isn't +// worth it. However, if we do try to mmap and it fails, return the error. +fn maybe_memmap_file(file: &File) -> Result<Option<memmap::Mmap>> { + let metadata = file.metadata()?; + let file_size = metadata.len(); + Ok(if !metadata.is_file() { + // Not a real file. + None + } else if file_size > isize::max_value() as u64 { + // Too long to safely map. + // https://github.com/danburkert/memmap-rs/issues/69 + None + } else if file_size == 0 { + // Mapping an empty file currently fails. + // https://github.com/danburkert/memmap-rs/issues/72 + None + } else if file_size < 16 * 1024 { + // Mapping small files is not worth it. + None + } else { + // Explicitly set the length of the memory map, so that filesystem + // changes can't race to violate the invariants we just checked. + let map = unsafe { + memmap::MmapOptions::new() + .len(file_size as usize) + .map(&file)? + }; + Some(map) + }) +} + +fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> { + // Encoding multiples of the block size is most efficient. 
+ let mut len = args.len()?; + let mut block = [0; blake3::BLOCK_LEN]; + while len > 0 { + output.fill(&mut block); + let hex_str = hex::encode(&block[..]); + let take_bytes = cmp::min(len, block.len() as u64); + print!("{}", &hex_str[..2 * take_bytes as usize]); + len -= take_bytes; + } + Ok(()) +} + +fn write_raw_output(output: blake3::OutputReader, args: &Args) -> Result<()> { + let mut output = output.take(args.len()?); + let stdout = std::io::stdout(); + let mut handler = stdout.lock(); + std::io::copy(&mut output, &mut handler)?; + + Ok(()) +} + +fn read_key_from_stdin() -> Result<[u8; blake3::KEY_LEN]> { + let mut bytes = Vec::with_capacity(blake3::KEY_LEN + 1); + let n = std::io::stdin() + .lock() + .take(blake3::KEY_LEN as u64 + 1) + .read_to_end(&mut bytes)?; + if n < 32 { + bail!( + "expected {} key bytes from stdin, found {}", + blake3::KEY_LEN, + n, + ) + } else if n > 32 { + bail!("read more than {} key bytes from stdin", blake3::KEY_LEN) + } else { + Ok(bytes[..blake3::KEY_LEN].try_into().unwrap()) + } +} + +struct FilepathString { + filepath_string: String, + is_escaped: bool, +} + +// returns (string, did_escape) +fn filepath_to_string(filepath: &Path) -> FilepathString { + let unicode_cow = filepath.to_string_lossy(); + let mut filepath_string = unicode_cow.to_string(); + // If we're on Windows, normalize backslashes to forward slashes. This + // avoids a lot of ugly escaping in the common case, and it makes + // checkfiles created on Windows more likely to be portable to Unix. It + // also allows us to set a blanket "no backslashes allowed in checkfiles on + // Windows" rule, rather than allowing a Unix backslash to potentially get + // interpreted as a directory separator on Windows. 
+ if cfg!(windows) { + filepath_string = filepath_string.replace('\\', "/"); + } + let mut is_escaped = false; + if filepath_string.contains('\\') || filepath_string.contains('\n') { + filepath_string = filepath_string.replace('\\', "\\\\").replace('\n', "\\n"); + is_escaped = true; + } + FilepathString { + filepath_string, + is_escaped, + } +} + +fn hex_half_byte(c: char) -> Result<u8> { + // The hex characters in the hash must be lowercase for now, though we + // could support uppercase too if we wanted to. + if '0' <= c && c <= '9' { + return Ok(c as u8 - '0' as u8); + } + if 'a' <= c && c <= 'f' { + return Ok(c as u8 - 'a' as u8 + 10); + } + bail!("Invalid hex"); +} + +// The `check` command is a security tool. That means it's much better for a +// check to fail more often than it should (a false negative), than for a check +// to ever succeed when it shouldn't (a false positive). By forbidding certain +// characters in checked filepaths, we avoid a class of false positives where +// two different filepaths can get confused with each other. +fn check_for_invalid_characters(utf8_path: &str) -> Result<()> { + // Null characters in paths should never happen, but they can result in a + // path getting silently truncated on Unix. + if utf8_path.contains('\0') { + bail!("Null character in path"); + } + // Because we convert invalid UTF-8 sequences in paths to the Unicode + // replacement character, multiple different invalid paths can map to the + // same UTF-8 string. + if utf8_path.contains('�') { + bail!("Unicode replacement character in path"); + } + // We normalize all Windows backslashes to forward slashes in our output, + // so the only natural way to get a backslash in a checkfile on Windows is + // to construct it on Unix and copy it over. (Or of course you could just + // doctor it by hand.) To avoid confusing this with a directory separator, + // we forbid backslashes entirely on Windows. Note that this check comes + // after unescaping has been done. 
+ if cfg!(windows) && utf8_path.contains('\\') { + bail!("Backslash in path"); + } + Ok(()) +} + +fn unescape(mut path: &str) -> Result<String> { + let mut unescaped = String::with_capacity(2 * path.len()); + while let Some(i) = path.find('\\') { + ensure!(i < path.len() - 1, "Invalid backslash escape"); + unescaped.push_str(&path[..i]); + match path[i + 1..].chars().next().unwrap() { + // Anything other than a recognized escape sequence is an error. + 'n' => unescaped.push_str("\n"), + '\\' => unescaped.push_str("\\"), + _ => bail!("Invalid backslash escape"), + } + path = &path[i + 2..]; + } + unescaped.push_str(path); + Ok(unescaped) +} + +#[derive(Debug)] +struct ParsedCheckLine { + file_string: String, + is_escaped: bool, + file_path: PathBuf, + expected_hash: blake3::Hash, +} + +fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> { + // Trim off the trailing newline, if any. + line = line.trim_end_matches('\n'); + // If there's a backslash at the front of the line, that means we need to + // unescape the path below. This matches the behavior of e.g. md5sum. + let first = if let Some(c) = line.chars().next() { + c + } else { + bail!("Empty line"); + }; + let mut is_escaped = false; + if first == '\\' { + is_escaped = true; + line = &line[1..]; + } + // The front of the line must be a hash of the usual length, followed by + // two spaces. The hex characters in the hash must be lowercase for now, + // though we could support uppercase too if we wanted to. + let hash_hex_len = 2 * blake3::OUT_LEN; + let num_spaces = 2; + let prefix_len = hash_hex_len + num_spaces; + ensure!(line.len() > prefix_len, "Short line"); + ensure!( + line.chars().take(prefix_len).all(|c| c.is_ascii()), + "Non-ASCII prefix" + ); + ensure!(&line[hash_hex_len..][..2] == " ", "Invalid space"); + // Decode the hash hex. 
+ let mut hash_bytes = [0; blake3::OUT_LEN]; + let mut hex_chars = line[..hash_hex_len].chars(); + for byte in &mut hash_bytes { + let high_char = hex_chars.next().unwrap(); + let low_char = hex_chars.next().unwrap(); + *byte = 16 * hex_half_byte(high_char)? + hex_half_byte(low_char)?; + } + let expected_hash: blake3::Hash = hash_bytes.into(); + let file_string = line[prefix_len..].to_string(); + let file_path_string = if is_escaped { + // If we detected a backslash at the start of the line earlier, now we + // need to unescape backslashes and newlines. + unescape(&file_string)? + } else { + file_string.clone().into() + }; + check_for_invalid_characters(&file_path_string)?; + Ok(ParsedCheckLine { + file_string, + is_escaped, + file_path: file_path_string.into(), + expected_hash, + }) +} + +fn hash_one_input(path: &Path, args: &Args) -> Result<()> { + let mut input = Input::open(path, args)?; + let output = input.hash(args)?; + if args.raw() { + write_raw_output(output, args)?; + return Ok(()); + } + if args.no_names() { + write_hex_output(output, args)?; + println!(); + return Ok(()); + } + let FilepathString { + filepath_string, + is_escaped, + } = filepath_to_string(path); + if is_escaped { + print!("\\"); + } + write_hex_output(output, args)?; + println!(" {}", filepath_string); + Ok(()) +} + +// Returns true for success. Having a boolean return value here, instead of +// passing down the some_file_failed reference, makes it less likely that we +// might forget to set it in some error condition. 
+fn check_one_line(line: &str, args: &Args) -> bool { + let parse_result = parse_check_line(&line); + let ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = match parse_result { + Ok(parsed) => parsed, + Err(e) => { + eprintln!("{}: {}", NAME, e); + return false; + } + }; + let file_string = if is_escaped { + "\\".to_string() + &file_string + } else { + file_string + }; + let hash_result: Result<blake3::Hash> = Input::open(&file_path, args) + .and_then(|mut input| input.hash(args)) + .map(|mut hash_output| { + let mut found_hash_bytes = [0; blake3::OUT_LEN]; + hash_output.fill(&mut found_hash_bytes); + found_hash_bytes.into() + }); + let found_hash: blake3::Hash = match hash_result { + Ok(hash) => hash, + Err(e) => { + println!("{}: FAILED ({})", file_string, e); + return false; + } + }; + // This is a constant-time comparison. + if expected_hash == found_hash { + if !args.quiet() { + println!("{}: OK", file_string); + } + true + } else { + println!("{}: FAILED", file_string); + false + } +} + +fn check_one_checkfile(path: &Path, args: &Args, some_file_failed: &mut bool) -> Result<()> { + let checkfile_input = Input::open(path, args)?; + let mut bufreader = io::BufReader::new(checkfile_input); + let mut line = String::new(); + loop { + line.clear(); + let n = bufreader.read_line(&mut line)?; + if n == 0 { + return Ok(()); + } + // check_one_line() prints errors and turns them into a success=false + // return, so it doesn't return a Result. + let success = check_one_line(&line, args); + if !success { + *some_file_failed = true; + } + } +} + +fn main() -> Result<()> { + let args = Args::parse()?; + let mut thread_pool_builder = rayon::ThreadPoolBuilder::new(); + if let Some(num_threads) = args.num_threads()? 
{ + thread_pool_builder = thread_pool_builder.num_threads(num_threads); + } + let thread_pool = thread_pool_builder.build()?; + thread_pool.install(|| { + let mut some_file_failed = false; + // Note that file_args automatically includes `-` if nothing is given. + for path in &args.file_args { + if args.check() { + // A hash mismatch or a failure to read a hashed file will be + // printed in the checkfile loop, and will not propagate here. + // This is similar to the explicit error handling we do in the + // hashing case immediately below. In these cases, + // some_file_failed will be set to true. + check_one_checkfile(path, &args, &mut some_file_failed)?; + } else { + // Errors encountered in hashing are tolerated and printed to + // stderr. This allows e.g. `b3sum *` to print errors for + // non-files and keep going. However, if we encounter any + // errors we'll still return non-zero at the end. + let result = hash_one_input(path, &args); + if let Err(e) = result { + some_file_failed = true; + eprintln!("{}: {}: {}", NAME, path.to_string_lossy(), e); + } + } + } + std::process::exit(if some_file_failed { 1 } else { 0 }); + }) +} diff --git a/3rdparty/BLAKE3/b3sum/src/unit_tests.rs b/3rdparty/BLAKE3/b3sum/src/unit_tests.rs new file mode 100644 index 000000000..1fa1a17dc --- /dev/null +++ b/3rdparty/BLAKE3/b3sum/src/unit_tests.rs @@ -0,0 +1,189 @@ +use std::path::Path; + +#[test] +fn test_parse_check_line() { + // ========================= + // ===== Success Cases ===== + // ========================= + + // the basic case + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "0909090909090909090909090909090909090909090909090909090909090909 foo", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x09; 32])); + assert!(!is_escaped); + assert_eq!(file_string, "foo"); + assert_eq!(file_path, Path::new("foo")); + + // regular whitespace + let crate::ParsedCheckLine { + file_string, + 
is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "fafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafa fo \to\n\n\n", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0xfa; 32])); + assert!(!is_escaped); + assert_eq!(file_string, "fo \to"); + assert_eq!(file_path, Path::new("fo \to")); + + // path is one space + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "4242424242424242424242424242424242424242424242424242424242424242 ", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x42; 32])); + assert!(!is_escaped); + assert_eq!(file_string, " "); + assert_eq!(file_path, Path::new(" ")); + + // *Unescaped* backslashes. Note that this line does *not* start with a + // backslash, so something like "\" + "n" is interpreted as *two* + // characters. We forbid all backslashes on Windows, so this test is + // Unix-only. + if cfg!(not(windows)) { + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "4343434343434343434343434343434343434343434343434343434343434343 fo\\a\\no", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x43; 32])); + assert!(!is_escaped); + assert_eq!(file_string, "fo\\a\\no"); + assert_eq!(file_path, Path::new("fo\\a\\no")); + } + + // escaped newline + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "\\4444444444444444444444444444444444444444444444444444444444444444 fo\\n\\no", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x44; 32])); + assert!(is_escaped); + assert_eq!(file_string, "fo\\n\\no"); + assert_eq!(file_path, Path::new("fo\n\no")); + + // Escaped newline and backslash. Again because backslash is not allowed on + // Windows, this test is Unix-only. 
+ if cfg!(not(windows)) { + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "\\4545454545454545454545454545454545454545454545454545454545454545 fo\\n\\\\o", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x45; 32])); + assert!(is_escaped); + assert_eq!(file_string, "fo\\n\\\\o"); + assert_eq!(file_path, Path::new("fo\n\\o")); + } + + // non-ASCII path + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "4646464646464646464646464646464646464646464646464646464646464646 否认", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x46; 32])); + assert!(!is_escaped); + assert_eq!(file_string, "否认"); + assert_eq!(file_path, Path::new("否认")); + + // ========================= + // ===== Failure Cases ===== + // ========================= + + // too short + crate::parse_check_line("").unwrap_err(); + crate::parse_check_line("0").unwrap_err(); + crate::parse_check_line("00").unwrap_err(); + crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000") + .unwrap_err(); + crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 ") + .unwrap_err(); + + // not enough spaces + crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 foo") + .unwrap_err(); + + // capital letter hex + crate::parse_check_line( + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA foo", + ) + .unwrap_err(); + + // non-hex hex + crate::parse_check_line( + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx foo", + ) + .unwrap_err(); + + // non-ASCII hex + crate::parse_check_line("你好, 我叫杰克. 认识你很高兴. 要不要吃个香蕉? 
foo").unwrap_err(); + + // invalid escape sequence + crate::parse_check_line( + "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\o", + ) + .unwrap_err(); + + // truncated escape sequence + crate::parse_check_line( + "\\0000000000000000000000000000000000000000000000000000000000000000 foo\\", + ) + .unwrap_err(); + + // null char + crate::parse_check_line( + "0000000000000000000000000000000000000000000000000000000000000000 fo\0o", + ) + .unwrap_err(); + + // Unicode replacement char + crate::parse_check_line( + "0000000000000000000000000000000000000000000000000000000000000000 fo�o", + ) + .unwrap_err(); + + // On Windows only, backslashes are not allowed, escaped or otherwise. + if cfg!(windows) { + crate::parse_check_line( + "0000000000000000000000000000000000000000000000000000000000000000 fo\\o", + ) + .unwrap_err(); + crate::parse_check_line( + "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\\\o", + ) + .unwrap_err(); + } +} diff --git a/3rdparty/BLAKE3/b3sum/tests/cli_tests.rs b/3rdparty/BLAKE3/b3sum/tests/cli_tests.rs new file mode 100644 index 000000000..51fbbba98 --- /dev/null +++ b/3rdparty/BLAKE3/b3sum/tests/cli_tests.rs @@ -0,0 +1,552 @@ +use duct::cmd; +use std::ffi::OsString; +use std::fs; +use std::io::prelude::*; +use std::path::PathBuf; + +pub fn b3sum_exe() -> PathBuf { + env!("CARGO_BIN_EXE_b3sum").into() +} + +#[test] +fn test_hash_one() { + let expected = format!("{} -", blake3::hash(b"foo").to_hex()); + let output = cmd!(b3sum_exe()).stdin_bytes("foo").read().unwrap(); + assert_eq!(&*expected, output); +} + +#[test] +fn test_hash_one_raw() { + let expected = blake3::hash(b"foo").as_bytes().to_owned(); + let output = cmd!(b3sum_exe(), "--raw") + .stdin_bytes("foo") + .stdout_capture() + .run() + .unwrap() + .stdout; + assert_eq!(expected, output.as_slice()); +} + +#[test] +fn test_hash_many() { + let dir = tempfile::tempdir().unwrap(); + let file1 = dir.path().join("file1"); + fs::write(&file1, 
b"foo").unwrap(); + let file2 = dir.path().join("file2"); + fs::write(&file2, b"bar").unwrap(); + + let output = cmd!(b3sum_exe(), &file1, &file2).read().unwrap(); + let foo_hash = blake3::hash(b"foo"); + let bar_hash = blake3::hash(b"bar"); + let expected = format!( + "{} {}\n{} {}", + foo_hash.to_hex(), + // account for slash normalization on Windows + file1.to_string_lossy().replace("\\", "/"), + bar_hash.to_hex(), + file2.to_string_lossy().replace("\\", "/"), + ); + assert_eq!(expected, output); + + let output_no_names = cmd!(b3sum_exe(), "--no-names", &file1, &file2) + .read() + .unwrap(); + let expected_no_names = format!("{}\n{}", foo_hash.to_hex(), bar_hash.to_hex(),); + assert_eq!(expected_no_names, output_no_names); +} + +#[test] +fn test_missing_files() { + let dir = tempfile::tempdir().unwrap(); + let file1 = dir.path().join("file1"); + fs::write(&file1, b"foo").unwrap(); + let file2 = dir.path().join("file2"); + fs::write(&file2, b"bar").unwrap(); + + let output = cmd!(b3sum_exe(), "file1", "missing_file", "file2") + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + assert!(!output.status.success()); + + let foo_hash = blake3::hash(b"foo"); + let bar_hash = blake3::hash(b"bar"); + let expected_stdout = format!( + "{} file1\n{} file2\n", + foo_hash.to_hex(), + bar_hash.to_hex(), + ); + assert_eq!(expected_stdout.as_bytes(), &output.stdout[..]); + + let bing_error = fs::File::open(dir.path().join("missing_file")).unwrap_err(); + let expected_stderr = format!("b3sum: missing_file: {}\n", bing_error.to_string()); + assert_eq!(expected_stderr.as_bytes(), &output.stderr[..]); +} + +#[test] +fn test_hash_length() { + let mut buf = [0; 100]; + blake3::Hasher::new() + .update(b"foo") + .finalize_xof() + .fill(&mut buf); + let expected = format!("{} -", hex::encode(&buf[..])); + let output = cmd!(b3sum_exe(), "--length=100") + .stdin_bytes("foo") + .read() + .unwrap(); + assert_eq!(&*expected, &*output); +} + 
+#[test] +fn test_keyed() { + let key = [42; blake3::KEY_LEN]; + let f = tempfile::NamedTempFile::new().unwrap(); + f.as_file().write_all(b"foo").unwrap(); + f.as_file().flush().unwrap(); + let expected = blake3::keyed_hash(&key, b"foo").to_hex(); + let output = cmd!(b3sum_exe(), "--keyed", "--no-names", f.path()) + .stdin_bytes(&key[..]) + .read() + .unwrap(); + assert_eq!(&*expected, &*output); +} + +#[test] +fn test_derive_key() { + let context = "BLAKE3 2019-12-28 10:28:41 example context"; + let f = tempfile::NamedTempFile::new().unwrap(); + f.as_file().write_all(b"key material").unwrap(); + f.as_file().flush().unwrap(); + let mut derive_key_out = [0; blake3::OUT_LEN]; + blake3::derive_key(context, b"key material", &mut derive_key_out); + let expected = hex::encode(&derive_key_out); + let output = cmd!(b3sum_exe(), "--derive-key", context, "--no-names", f.path()) + .read() + .unwrap(); + assert_eq!(&*expected, &*output); +} + +#[test] +fn test_no_mmap() { + let f = tempfile::NamedTempFile::new().unwrap(); + f.as_file().write_all(b"foo").unwrap(); + f.as_file().flush().unwrap(); + + let expected = blake3::hash(b"foo").to_hex(); + let output = cmd!(b3sum_exe(), "--no-mmap", "--no-names", f.path()) + .read() + .unwrap(); + assert_eq!(&*expected, &*output); +} + +#[test] +fn test_length_without_value_is_an_error() { + let result = cmd!(b3sum_exe(), "--length") + .stdin_bytes("foo") + .stderr_capture() + .run(); + assert!(result.is_err()); +} + +#[test] +fn test_raw_with_multi_files_is_an_error() { + let f1 = tempfile::NamedTempFile::new().unwrap(); + let f2 = tempfile::NamedTempFile::new().unwrap(); + + // Make sure it doesn't error with just one file + let result = cmd!(b3sum_exe(), "--raw", f1.path()).stdout_capture().run(); + assert!(result.is_ok()); + + // Make sure it errors when both file are passed + let result = cmd!(b3sum_exe(), "--raw", f1.path(), f2.path()) + .stderr_capture() + .run(); + assert!(result.is_err()); +} + +#[test] +#[cfg(unix)] +fn 
test_newline_and_backslash_escaping_on_unix() { + let empty_hash = blake3::hash(b"").to_hex(); + let dir = tempfile::tempdir().unwrap(); + fs::create_dir(dir.path().join("subdir")).unwrap(); + let names = [ + "abcdef", + "abc\ndef", + "abc\\def", + "abc\rdef", + "abc\r\ndef", + "subdir/foo", + ]; + let mut paths = Vec::new(); + for name in &names { + let path = dir.path().join(name); + println!("creating file at {:?}", path); + fs::write(&path, b"").unwrap(); + paths.push(path); + } + let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); + let expected = format!( + "\ +{0} abcdef +\\{0} abc\\ndef +\\{0} abc\\\\def +{0} abc\rdef +\\{0} abc\r\\ndef +{0} subdir/foo", + empty_hash, + ); + println!("output"); + println!("======"); + println!("{}", output); + println!(); + println!("expected"); + println!("========"); + println!("{}", expected); + println!(); + assert_eq!(expected, output); +} + +#[test] +#[cfg(windows)] +fn test_slash_normalization_on_windows() { + let empty_hash = blake3::hash(b"").to_hex(); + let dir = tempfile::tempdir().unwrap(); + fs::create_dir(dir.path().join("subdir")).unwrap(); + // Note that filenames can't contain newlines or backslashes on Windows, so + // we don't test escaping here. We only test forward slash and backslash as + // directory separators. 
+ let names = ["abcdef", "subdir/foo", "subdir\\bar"]; + let mut paths = Vec::new(); + for name in &names { + let path = dir.path().join(name); + println!("creating file at {:?}", path); + fs::write(&path, b"").unwrap(); + paths.push(path); + } + let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); + let expected = format!( + "\ +{0} abcdef +{0} subdir/foo +{0} subdir/bar", + empty_hash, + ); + println!("output"); + println!("======"); + println!("{}", output); + println!(); + println!("expected"); + println!("========"); + println!("{}", expected); + println!(); + assert_eq!(expected, output); +} + +#[test] +#[cfg(unix)] +fn test_invalid_unicode_on_unix() { + use std::os::unix::ffi::OsStringExt; + + let empty_hash = blake3::hash(b"").to_hex(); + let dir = tempfile::tempdir().unwrap(); + let names = ["abcdef".into(), OsString::from_vec(b"abc\xffdef".to_vec())]; + let mut paths = Vec::new(); + for name in &names { + let path = dir.path().join(name); + println!("creating file at {:?}", path); + // Note: Some operating systems, macOS in particular, simply don't + // allow invalid Unicode in filenames. On those systems, this write + // will fail. That's fine, we'll just short-circuit this test in that + // case. But assert that at least Linux allows this. 
+ let write_result = fs::write(&path, b""); + if cfg!(target_os = "linux") { + write_result.expect("Linux should allow invalid Unicode"); + } else if write_result.is_err() { + return; + } + paths.push(path); + } + let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); + let expected = format!( + "\ +{0} abcdef +{0} abc�def", + empty_hash, + ); + println!("output"); + println!("======"); + println!("{}", output); + println!(); + println!("expected"); + println!("========"); + println!("{}", expected); + println!(); + assert_eq!(expected, output); +} + +#[test] +#[cfg(windows)] +fn test_invalid_unicode_on_windows() { + use std::os::windows::ffi::OsStringExt; + + let empty_hash = blake3::hash(b"").to_hex(); + let dir = tempfile::tempdir().unwrap(); + let surrogate_char = 0xDC00; + let bad_unicode_wchars = [ + 'a' as u16, + 'b' as u16, + 'c' as u16, + surrogate_char, + 'd' as u16, + 'e' as u16, + 'f' as u16, + ]; + let bad_osstring = OsString::from_wide(&bad_unicode_wchars); + let names = ["abcdef".into(), bad_osstring]; + let mut paths = Vec::new(); + for name in &names { + let path = dir.path().join(name); + println!("creating file at {:?}", path); + fs::write(&path, b"").unwrap(); + paths.push(path); + } + let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); + let expected = format!( + "\ +{0} abcdef +{0} abc�def", + empty_hash, + ); + println!("output"); + println!("======"); + println!("{}", output); + println!(); + println!("expected"); + println!("========"); + println!("{}", expected); + println!(); + assert_eq!(expected, output); +} + +#[test] +fn test_check() { + // Make a directory full of files, and make sure the b3sum output in that + // directory is what we expect. 
+ let a_hash = blake3::hash(b"a").to_hex(); + let b_hash = blake3::hash(b"b").to_hex(); + let cd_hash = blake3::hash(b"cd").to_hex(); + let dir = tempfile::tempdir().unwrap(); + fs::write(dir.path().join("a"), b"a").unwrap(); + fs::write(dir.path().join("b"), b"b").unwrap(); + fs::create_dir(dir.path().join("c")).unwrap(); + fs::write(dir.path().join("c/d"), b"cd").unwrap(); + let output = cmd!(b3sum_exe(), "a", "b", "c/d") + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_checkfile = format!( + "{} a\n\ + {} b\n\ + {} c/d\n", + a_hash, b_hash, cd_hash, + ); + assert_eq!(expected_checkfile, stdout); + assert_eq!("", stderr); + + // Now use the output we just validated as a checkfile, passed to stdin. + let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes(expected_checkfile.as_bytes()) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_check_output = "\ + a: OK\n\ + b: OK\n\ + c/d: OK\n"; + assert_eq!(expected_check_output, stdout); + assert_eq!("", stderr); + + // Now pass the same checkfile twice on the command line just for fun. 
+ let checkfile_path = dir.path().join("checkfile"); + fs::write(&checkfile_path, &expected_checkfile).unwrap(); + let output = cmd!(b3sum_exe(), "--check", &checkfile_path, &checkfile_path) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let mut double_check_output = String::new(); + double_check_output.push_str(&expected_check_output); + double_check_output.push_str(&expected_check_output); + assert_eq!(double_check_output, stdout); + assert_eq!("", stderr); + + // Corrupt one of the files and check again. + fs::write(dir.path().join("b"), b"CORRUPTION").unwrap(); + let output = cmd!(b3sum_exe(), "--check", &checkfile_path) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_check_failure = "\ + a: OK\n\ + b: FAILED\n\ + c/d: OK\n"; + assert!(!output.status.success()); + assert_eq!(expected_check_failure, stdout); + assert_eq!("", stderr); + + // Delete one of the files and check again. + fs::remove_file(dir.path().join("b")).unwrap(); + let open_file_error = fs::File::open(dir.path().join("b")).unwrap_err(); + let output = cmd!(b3sum_exe(), "--check", &checkfile_path) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_check_failure = format!( + "a: OK\n\ + b: FAILED ({})\n\ + c/d: OK\n", + open_file_error, + ); + assert!(!output.status.success()); + assert_eq!(expected_check_failure, stdout); + assert_eq!("", stderr); + + // Confirm that --quiet suppresses the OKs but not the FAILEDs. 
+ let output = cmd!(b3sum_exe(), "--check", "--quiet", &checkfile_path) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_check_failure = format!("b: FAILED ({})\n", open_file_error); + assert!(!output.status.success()); + assert_eq!(expected_check_failure, stdout); + assert_eq!("", stderr); +} + +#[test] +fn test_check_invalid_characters() { + // Check that a null character in the path fails. + let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \0") + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + assert!(!output.status.success()); + assert_eq!("", stdout); + assert_eq!("b3sum: Null character in path\n", stderr); + + // Check that a Unicode replacement character in the path fails. + let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 �") + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + assert!(!output.status.success()); + assert_eq!("", stdout); + assert_eq!("b3sum: Unicode replacement character in path\n", stderr); + + // Check that an invalid escape sequence in the path fails. 
+ let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes("\\0000000000000000000000000000000000000000000000000000000000000000 \\a") + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + assert!(!output.status.success()); + assert_eq!("", stdout); + assert_eq!("b3sum: Invalid backslash escape\n", stderr); + + // Windows also forbids literal backslashes. Check for that if and only if + // we're on Windows. + if cfg!(windows) { + let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \\") + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + assert!(!output.status.success()); + assert_eq!("", stdout); + assert_eq!("b3sum: Backslash in path\n", stderr); + } +} + +#[test] +fn test_globbing() { + // On Unix, globbing is provided by the shell. On Windows, globbing is + // provided by us, using the `wild` crate. + let dir = tempfile::tempdir().unwrap(); + let file1 = dir.path().join("file1"); + fs::write(&file1, b"foo").unwrap(); + let file2 = dir.path().join("file2"); + fs::write(&file2, b"bar").unwrap(); + + let foo_hash = blake3::hash(b"foo"); + let bar_hash = blake3::hash(b"bar"); + // NOTE: This assumes that the glob will be expanded in alphabetical order, + // to "file1 file2" rather than "file2 file1". So far, this seems to + // be true (guaranteed?) of Unix shell behavior, and true in practice + // with the `wild` crate on Windows. It's possible that this could + // start failing in the future, though, or on some unknown platform. + // If that ever happens, we'll need to relax this test somehow, + // probably by just testing for both possible outputs. 
I'm not + // handling that case in advance, though, because I'd prefer to hear + // about it if it comes up. + let expected = format!("{} file1\n{} file2", foo_hash.to_hex(), bar_hash.to_hex()); + + let star_command = format!("{} *", b3sum_exe().to_str().unwrap()); + let (exe, c_flag) = if cfg!(windows) { + ("cmd.exe", "/C") + } else { + ("/bin/sh", "-c") + }; + let output = cmd!(exe, c_flag, star_command) + .dir(dir.path()) + .read() + .unwrap(); + assert_eq!(expected, output); +} diff --git a/3rdparty/BLAKE3/b3sum/what_does_check_do.md b/3rdparty/BLAKE3/b3sum/what_does_check_do.md new file mode 100644 index 000000000..3a44a0010 --- /dev/null +++ b/3rdparty/BLAKE3/b3sum/what_does_check_do.md @@ -0,0 +1,174 @@ +# How does `b3sum --check` behave exactly?<br>or: Are filepaths...text? + +Most of the time, `b3sum --check` is a drop-in replacement for `md5sum --check` +and other Coreutils hashing tools. It consumes a checkfile (the output of a +regular `b3sum` command), re-hashes all the files listed there, and returns +success if all of those hashes are still correct. What makes this more +complicated than it might seem, is that representing filepaths as text means we +need to consider many possible edge cases of unrepresentable filepaths. This +document describes all of these edge cases in detail. 
+ +## The simple case + +Here's the result of running `b3sum a b c/d` in a directory that contains +those three files: + +```bash +$ echo hi > a +$ echo lo > b +$ mkdir c +$ echo stuff > c/d +$ b3sum a b c/d +0b8b60248fad7ac6dfac221b7e01a8b91c772421a15b387dd1fb2d6a94aee438 a +6ae4a57bbba24f79c461d30bcb4db973b9427d9207877e34d2d74528daa84115 b +2d477356c962e54784f1c5dc5297718d92087006f6ee96b08aeaf7f3cd252377 c/d +``` + +If we pipe that output into `b3sum --check`, it will exit with status zero +(success) and print: + +```bash +$ b3sum a b c/d | b3sum --check +a: OK +b: OK +c/d: OK +``` + +If we delete `b` and change the contents of `c/d`, and then use the same +checkfile as above, `b3sum --check` will exit with a non-zero status (failure) +and print: + +```bash +$ b3sum a b c/d > checkfile +$ rm b +$ echo more stuff >> c/d +$ b3sum --check checkfile +a: OK +b: FAILED (No such file or directory (os error 2)) +c/d: FAILED +``` + +In these typical cases, `b3sum` and `md5sum` have identical output for success +and very similar output for failure. + +## Escaping newlines and backslashes + +Since the checkfile format (the regular output format of `b3sum`) is +newline-separated text, we need to worry about what happens when a filepath +contains a newline, or worse. Suppose we create a file named `x[newline]x` +(3 characters). One way to create such a file is with a Python one-liner like +this: + +```python +>>> open("x\nx", "w") +``` + +Here's what happens when we hash that file with `b3sum`: + +```bash +$ b3sum x* +\af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 x\nx +``` + +Notice two things. First, `b3sum` puts a single `\` character at the front of +the line. This indicates that the filepath contains escape sequences that +`b3sum --check` will need to unescape. Then, `b3sum` replaces the newline +character in the filepath with the two-character escape sequence `\n`. 
+Similarly, if the filepath contained a backslash, `b3sum` would escape it as +`\\` in the output. So far, all of this behavior is still identical to +`md5sum`. + +## Invalid Unicode + +This is where `b3sum` and `md5sum` diverge. Apart from the newline and backslash +escapes described above, `md5sum` copies all other filepath bytes verbatim to +its output. That means its output encoding is "ASCII plus whatever bytes we got +from the command line". This creates two problems: + +1. Printing something that isn't UTF-8 is kind of gross. +2. Windows support. + +What's the deal with Windows? To start with, there's a fundamental difference +in how Unix and Windows represent filepaths. Unix filepaths are "usually UTF-8" +and Windows filepaths are "usually UTF-16". That means that a file named `abc` +is typically represented as the bytes `[97, 98, 99]` on Unix and as the bytes +`[97, 0, 98, 0, 99, 0]` on Windows. The `md5sum` approach won't work if we plan +on creating a checkfile on Unix and checking it on Windows, or vice versa. + +A more portable approach is to convert platform-specific bytes into some +consistent Unicode encoding. (In practice this is going to be UTF-8, but in +theory it could be anything.) Then when `--check` needs to open a file, we +convert the Unicode representation back into platform-specific bytes. This +makes important common cases like `abc`, and in fact even `abc[newline]def`, +work as expected. Great! + +But...what did we mean above when we said *usually* UTF-8 and *usually* UTF-16? +It turns out that not every possible sequence of bytes is valid UTF-8, and not +every possible sequence of 16-bit wide chars is valid UTF-16. For example, the +byte 0xFF (255) can never appear in any UTF-8 string.
If we ask Python to +decode it, it yells at us: + +```python +>>> b"\xFF".decode("UTF-8") +UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte +``` + +However, tragically, we *can* create a file with that byte in its name (on +Linux at least, though not usually on macOS): + +```python +>>> open(b"y\xFFy", "w") +``` + +So some filepaths aren't representable in Unicode at all. Our plan to "convert +platform-specific bytes into some consistent Unicode encoding" isn't going to +work for everything. What does `b3sum` do with the file above? + +```bash +$ b3sum y* +af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 y�y +``` + +That � in there is a "Unicode replacement character". When we run into +filepaths that we can't represent in Unicode, we replace the unrepresentable +parts with these characters. On the checking side, to avoid any possible +confusion between two different invalid filepaths, we automatically fail if we +see a replacement character. Together with a few more details covered in the +next section, this gives us an important set of properties: + +1. Any file can be hashed locally. +2. Any file with a valid Unicode name not containing the � character can be + checked. +3. Checking ambiguous or unrepresentable filepaths always fails. +4. Checkfiles are always valid UTF-8. +5. Checkfiles are portable between Unix and Windows. + +## Formal Rules + +1. When hashing, filepaths are represented in a platform-specific encoding, + which can accommodate any filepath on the current platform. In Rust, this is + `OsStr`/`OsString`. +2. In output, filepaths are first converted to UTF-8. Any non-Unicode segments + are replaced with Unicode replacement characters (U+FFFD). In Rust, this is + `OsStr::to_string_lossy`. +3. Then, if a filepath contains any backslashes (U+005C) or newlines (U+000A), + these characters are escaped as `\\` and `\n` respectively. +4. 
Finally, any output line containing an escape sequence is prefixed with a + single backslash. +5. When checking, each line is parsed as UTF-8, separated by a newline + (U+000A). Invalid UTF-8 is an error. +6. Then, if a line begins with a backslash, the filepath component is + unescaped. Any escape sequence other than `\\` or `\n` is an error. If a + line does not begin with a backslash, unescaping is not performed, and any + backslashes in the filepath component are interpreted literally. (`b3sum` + output never contains unescaped backslashes, but they can occur in + checkfiles assembled by hand.) +7. Finally, if a filepath contains a Unicode replacement character (U+FFFD) or + a null character (U+0000), it is an error. + + **Additionally, on Windows only:** + +8. In output, all backslashes (U+005C) are replaced with forward slashes + (U+002F). +9. When checking, after unescaping, if a filepath contains a backslash, it is + an error. diff --git a/3rdparty/BLAKE3/benches/bench.rs b/3rdparty/BLAKE3/benches/bench.rs new file mode 100644 index 000000000..ba5a4041f --- /dev/null +++ b/3rdparty/BLAKE3/benches/bench.rs @@ -0,0 +1,520 @@ +#![feature(test)] + +extern crate test; + +use arrayref::array_ref; +use arrayvec::ArrayVec; +use blake3::platform::{Platform, MAX_SIMD_DEGREE}; +use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use rand::prelude::*; +use test::Bencher; + +const KIB: usize = 1024; + +// This struct randomizes two things: +// 1. The actual bytes of input. +// 2. The page offset the input starts at. 
+pub struct RandomInput { + buf: Vec<u8>, + len: usize, + offsets: Vec<usize>, + offset_index: usize, +} + +impl RandomInput { + pub fn new(b: &mut Bencher, len: usize) -> Self { + b.bytes += len as u64; + let page_size: usize = page_size::get(); + let mut buf = vec![0u8; len + page_size]; + let mut rng = rand::thread_rng(); + rng.fill_bytes(&mut buf); + let mut offsets: Vec<usize> = (0..page_size).collect(); + offsets.shuffle(&mut rng); + Self { + buf, + len, + offsets, + offset_index: 0, + } + } + + pub fn get(&mut self) -> &[u8] { + let offset = self.offsets[self.offset_index]; + self.offset_index += 1; + if self.offset_index >= self.offsets.len() { + self.offset_index = 0; + } + &self.buf[offset..][..self.len] + } +} + +fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { + let mut state = [1u32; 8]; + let mut r = RandomInput::new(b, 64); + let input = array_ref!(r.get(), 0, 64); + b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); +} + +#[bench] +fn bench_single_compression_portable(b: &mut Bencher) { + bench_single_compression_fn(b, Platform::portable()); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse2(b: &mut Bencher) { + if let Some(platform) = Platform::sse2() { + bench_single_compression_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse41(b: &mut Bencher) { + if let Some(platform) = Platform::sse41() { + bench_single_compression_fn(b, platform); + } +} + +#[bench] +#[cfg(blake3_avx512_ffi)] +fn bench_single_compression_avx512(b: &mut Bencher) { + if let Some(platform) = Platform::avx512() { + bench_single_compression_fn(b, platform); + } +} + +fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, CHUNK_LEN)); + } + b.iter(|| { + let input_arrays: 
ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::Yes, + 0, + 0, + 0, + &mut out, + ); + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse2(b: &mut Bencher) { + if let Some(platform) = Platform::sse2() { + bench_many_chunks_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse41(b: &mut Bencher) { + if let Some(platform) = Platform::sse41() { + bench_many_chunks_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_avx2(b: &mut Bencher) { + if let Some(platform) = Platform::avx2() { + bench_many_chunks_fn(b, platform); + } +} + +#[bench] +#[cfg(blake3_avx512_ffi)] +fn bench_many_chunks_avx512(b: &mut Bencher) { + if let Some(platform) = Platform::avx512() { + bench_many_chunks_fn(b, platform); + } +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_chunks_neon(b: &mut Bencher) { + if let Some(platform) = Platform::neon() { + bench_many_chunks_fn(b, platform); + } +} + +// TODO: When we get const generics we can unify this with the chunks code. 
+fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, BLOCK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::No, + 0, + 0, + 0, + &mut out, + ); + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse2(b: &mut Bencher) { + if let Some(platform) = Platform::sse2() { + bench_many_parents_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse41(b: &mut Bencher) { + if let Some(platform) = Platform::sse41() { + bench_many_parents_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_avx2(b: &mut Bencher) { + if let Some(platform) = Platform::avx2() { + bench_many_parents_fn(b, platform); + } +} + +#[bench] +#[cfg(blake3_avx512_ffi)] +fn bench_many_parents_avx512(b: &mut Bencher) { + if let Some(platform) = Platform::avx512() { + bench_many_parents_fn(b, platform); + } +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_parents_neon(b: &mut Bencher) { + if let Some(platform) = Platform::neon() { + bench_many_parents_fn(b, platform); + } +} + +fn bench_atonce(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| blake3::hash(input.get())); +} + +#[bench] +fn bench_atonce_0001_block(b: &mut Bencher) { + bench_atonce(b, BLOCK_LEN); +} + +#[bench] +fn bench_atonce_0001_kib(b: &mut Bencher) { + bench_atonce(b, 1 * KIB); +} + +#[bench] +fn bench_atonce_0002_kib(b: &mut Bencher) { + bench_atonce(b, 2 * KIB); +} + +#[bench] +fn bench_atonce_0004_kib(b: &mut 
Bencher) { + bench_atonce(b, 4 * KIB); +} + +#[bench] +fn bench_atonce_0008_kib(b: &mut Bencher) { + bench_atonce(b, 8 * KIB); +} + +#[bench] +fn bench_atonce_0016_kib(b: &mut Bencher) { + bench_atonce(b, 16 * KIB); +} + +#[bench] +fn bench_atonce_0032_kib(b: &mut Bencher) { + bench_atonce(b, 32 * KIB); +} + +#[bench] +fn bench_atonce_0064_kib(b: &mut Bencher) { + bench_atonce(b, 64 * KIB); +} + +#[bench] +fn bench_atonce_0128_kib(b: &mut Bencher) { + bench_atonce(b, 128 * KIB); +} + +#[bench] +fn bench_atonce_0256_kib(b: &mut Bencher) { + bench_atonce(b, 256 * KIB); +} + +#[bench] +fn bench_atonce_0512_kib(b: &mut Bencher) { + bench_atonce(b, 512 * KIB); +} + +#[bench] +fn bench_atonce_1024_kib(b: &mut Bencher) { + bench_atonce(b, 1024 * KIB); +} + +fn bench_incremental(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| blake3::Hasher::new().update(input.get()).finalize()); +} + +#[bench] +fn bench_incremental_0001_block(b: &mut Bencher) { + bench_incremental(b, BLOCK_LEN); +} + +#[bench] +fn bench_incremental_0001_kib(b: &mut Bencher) { + bench_incremental(b, 1 * KIB); +} + +#[bench] +fn bench_incremental_0002_kib(b: &mut Bencher) { + bench_incremental(b, 2 * KIB); +} + +#[bench] +fn bench_incremental_0004_kib(b: &mut Bencher) { + bench_incremental(b, 4 * KIB); +} + +#[bench] +fn bench_incremental_0008_kib(b: &mut Bencher) { + bench_incremental(b, 8 * KIB); +} + +#[bench] +fn bench_incremental_0016_kib(b: &mut Bencher) { + bench_incremental(b, 16 * KIB); +} + +#[bench] +fn bench_incremental_0032_kib(b: &mut Bencher) { + bench_incremental(b, 32 * KIB); +} + +#[bench] +fn bench_incremental_0064_kib(b: &mut Bencher) { + bench_incremental(b, 64 * KIB); +} + +#[bench] +fn bench_incremental_0128_kib(b: &mut Bencher) { + bench_incremental(b, 128 * KIB); +} + +#[bench] +fn bench_incremental_0256_kib(b: &mut Bencher) { + bench_incremental(b, 256 * KIB); +} + +#[bench] +fn bench_incremental_0512_kib(b: &mut Bencher) { + 
bench_incremental(b, 512 * KIB); +} + +#[bench] +fn bench_incremental_1024_kib(b: &mut Bencher) { + bench_incremental(b, 1024 * KIB); +} + +fn bench_reference(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input.get()); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} + +#[bench] +fn bench_reference_0001_block(b: &mut Bencher) { + bench_reference(b, BLOCK_LEN); +} + +#[bench] +fn bench_reference_0001_kib(b: &mut Bencher) { + bench_reference(b, 1 * KIB); +} + +#[bench] +fn bench_reference_0002_kib(b: &mut Bencher) { + bench_reference(b, 2 * KIB); +} + +#[bench] +fn bench_reference_0004_kib(b: &mut Bencher) { + bench_reference(b, 4 * KIB); +} + +#[bench] +fn bench_reference_0008_kib(b: &mut Bencher) { + bench_reference(b, 8 * KIB); +} + +#[bench] +fn bench_reference_0016_kib(b: &mut Bencher) { + bench_reference(b, 16 * KIB); +} + +#[bench] +fn bench_reference_0032_kib(b: &mut Bencher) { + bench_reference(b, 32 * KIB); +} + +#[bench] +fn bench_reference_0064_kib(b: &mut Bencher) { + bench_reference(b, 64 * KIB); +} + +#[bench] +fn bench_reference_0128_kib(b: &mut Bencher) { + bench_reference(b, 128 * KIB); +} + +#[bench] +fn bench_reference_0256_kib(b: &mut Bencher) { + bench_reference(b, 256 * KIB); +} + +#[bench] +fn bench_reference_0512_kib(b: &mut Bencher) { + bench_reference(b, 512 * KIB); +} + +#[bench] +fn bench_reference_1024_kib(b: &mut Bencher) { + bench_reference(b, 1024 * KIB); +} + +#[cfg(feature = "rayon")] +fn bench_rayon(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| { + blake3::Hasher::new() + .update_with_join::<blake3::join::RayonJoin>(input.get()) + .finalize() + }); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0001_block(b: &mut Bencher) { + bench_rayon(b, BLOCK_LEN); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0001_kib(b: &mut Bencher) { + bench_rayon(b, 
1 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0002_kib(b: &mut Bencher) { + bench_rayon(b, 2 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0004_kib(b: &mut Bencher) { + bench_rayon(b, 4 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0008_kib(b: &mut Bencher) { + bench_rayon(b, 8 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0016_kib(b: &mut Bencher) { + bench_rayon(b, 16 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0032_kib(b: &mut Bencher) { + bench_rayon(b, 32 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0064_kib(b: &mut Bencher) { + bench_rayon(b, 64 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0128_kib(b: &mut Bencher) { + bench_rayon(b, 128 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0256_kib(b: &mut Bencher) { + bench_rayon(b, 256 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0512_kib(b: &mut Bencher) { + bench_rayon(b, 512 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_1024_kib(b: &mut Bencher) { + bench_rayon(b, 1024 * KIB); +} + +// This checks that update() splits up its input in increasing powers of 2, so +// that it can recover a high degree of parallelism when the number of bytes +// hashed so far is uneven. The performance of this benchmark should be +// reasonably close to bench_incremental_0064_kib, within 80% or so. When we +// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), +// performance was less than half. 
+#[bench] +fn bench_two_updates(b: &mut Bencher) { + let len = 65536; + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3::Hasher::new(); + let input = input.get(); + hasher.update(&input[..1]); + hasher.update(&input[1..]); + hasher.finalize() + }); +} diff --git a/3rdparty/BLAKE3/build.rs b/3rdparty/BLAKE3/build.rs new file mode 100644 index 000000000..ea657d8db --- /dev/null +++ b/3rdparty/BLAKE3/build.rs @@ -0,0 +1,260 @@ +use std::env; + +fn defined(var: &str) -> bool { + println!("cargo:rerun-if-env-changed={}", var); + env::var_os(var).is_some() +} + +fn is_pure() -> bool { + defined("CARGO_FEATURE_PURE") +} + +fn should_prefer_intrinsics() -> bool { + defined("CARGO_FEATURE_PREFER_INTRINSICS") +} + +fn is_neon() -> bool { + defined("CARGO_FEATURE_NEON") +} + +fn is_ci() -> bool { + defined("BLAKE3_CI") +} + +fn warn(warning: &str) { + assert!(!warning.contains("\n")); + println!("cargo:warning={}", warning); + if is_ci() { + println!("cargo:warning=Warnings in CI are treated as errors. Build failed."); + std::process::exit(1); + } +} + +fn target_components() -> Vec<String> { + let target = env::var("TARGET").unwrap(); + target.split("-").map(|s| s.to_string()).collect() +} + +fn is_x86_64() -> bool { + target_components()[0] == "x86_64" +} + +fn is_x86_32() -> bool { + let arch = &target_components()[0]; + arch == "i386" || arch == "i586" || arch == "i686" +} + +fn is_armv7() -> bool { + target_components()[0] == "armv7" +} + +// Windows targets may be using the MSVC toolchain or the GNU toolchain. The +// right compiler flags to use depend on the toolchain. (And we don't want to +// use flag_if_supported, because we don't want features to be silently +// disabled by old compilers.) +fn is_windows_msvc() -> bool { + // Some targets are only two components long, so check in steps. 
+ target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "msvc" +} + +fn is_windows_gnu() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "gnu" +} + +fn new_build() -> cc::Build { + let mut build = cc::Build::new(); + if !is_windows_msvc() { + build.flag("-std=c11"); + } + build +} + +#[derive(PartialEq)] +enum CCompilerSupport { + NoCompiler, + NoAVX512, + YesAVX512, +} +use CCompilerSupport::*; + +fn c_compiler_support() -> CCompilerSupport { + let build = new_build(); + let flags_checked; + let support_result: Result<bool, _> = if is_windows_msvc() { + flags_checked = "/arch:AVX512"; + build.is_flag_supported("/arch:AVX512") + } else { + // Check for both of the flags we use. If -mavx512f works, then -mavx512vl + // will probably always work too, but we might as well be thorough. + flags_checked = "-mavx512f and -mavx512vl"; + match build.is_flag_supported("-mavx512f") { + Ok(true) => build.is_flag_supported("-mavx512vl"), + false_or_error => false_or_error, + } + }; + match support_result { + Ok(true) => YesAVX512, + Ok(false) => { + warn(&format!( + "The C compiler {:?} does not support {}.", + build.get_compiler().path(), + flags_checked, + )); + NoAVX512 + } + Err(e) => { + println!("{:?}", e); + warn(&format!( + "No C compiler {:?} detected.", + build.get_compiler().path() + )); + NoCompiler + } + } +} + +fn build_sse2_sse41_avx2_rust_intrinsics() { + // No C code to compile here. Set the cfg flags that enable the Rust SSE2, + // SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile + // them. + println!("cargo:rustc-cfg=blake3_sse2_rust"); + println!("cargo:rustc-cfg=blake3_sse41_rust"); + println!("cargo:rustc-cfg=blake3_avx2_rust"); +} + +fn build_sse2_sse41_avx2_assembly() { + // Build the assembly implementations for SSE4.1 and AVX2. 
This is + // preferred, but it only supports x86_64. + assert!(is_x86_64()); + println!("cargo:rustc-cfg=blake3_sse2_ffi"); + println!("cargo:rustc-cfg=blake3_sse41_ffi"); + println!("cargo:rustc-cfg=blake3_avx2_ffi"); + let mut build = new_build(); + if is_windows_msvc() { + build.file("c/blake3_sse2_x86-64_windows_msvc.asm"); + build.file("c/blake3_sse41_x86-64_windows_msvc.asm"); + build.file("c/blake3_avx2_x86-64_windows_msvc.asm"); + } else if is_windows_gnu() { + build.file("c/blake3_sse2_x86-64_windows_gnu.S"); + build.file("c/blake3_sse41_x86-64_windows_gnu.S"); + build.file("c/blake3_avx2_x86-64_windows_gnu.S"); + } else { + // All non-Windows implementations are assumed to support + // Linux-style assembly. These files do contain a small + // explicit workaround for macOS also. + build.file("c/blake3_sse2_x86-64_unix.S"); + build.file("c/blake3_sse41_x86-64_unix.S"); + build.file("c/blake3_avx2_x86-64_unix.S"); + } + build.compile("blake3_sse2_sse41_avx2_assembly"); +} + +fn build_avx512_c_intrinsics() { + // This is required on 32-bit x86 targets, since the assembly + // implementation doesn't support those. + println!("cargo:rustc-cfg=blake3_avx512_ffi"); + let mut build = new_build(); + build.file("c/blake3_avx512.c"); + if is_windows_msvc() { + build.flag("/arch:AVX512"); + } else { + build.flag("-mavx512f"); + build.flag("-mavx512vl"); + } + if is_windows_gnu() { + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782. + build.flag("-fno-asynchronous-unwind-tables"); + } + build.compile("blake3_avx512_intrinsics"); +} + +fn build_avx512_assembly() { + // Build the assembly implementation for AVX-512. This is preferred, but it + // only supports x86_64.
+ assert!(is_x86_64()); + println!("cargo:rustc-cfg=blake3_avx512_ffi"); + let mut build = new_build(); + if is_windows_msvc() { + build.file("c/blake3_avx512_x86-64_windows_msvc.asm"); + } else { + if is_windows_gnu() { + build.file("c/blake3_avx512_x86-64_windows_gnu.S"); + } else { + // All non-Windows implementations are assumed to support Linux-style + // assembly. These files do contain a small explicit workaround for + // macOS also. + build.file("c/blake3_avx512_x86-64_unix.S"); + } + // Older versions of Clang require these flags, even for assembly. See + // https://github.com/BLAKE3-team/BLAKE3/issues/79. + build.flag("-mavx512f"); + build.flag("-mavx512vl"); + } + build.compile("blake3_avx512_assembly"); +} + +fn build_neon_c_intrinsics() { + let mut build = new_build(); + // Note that blake3_neon.c normally depends on the blake3_portable.c + // for the single-instance compression function, but we expose + // portable.rs over FFI instead. See ffi_neon.rs. + build.file("c/blake3_neon.c"); + // ARMv7 platforms that support NEON generally need the following + // flags. AArch64 supports NEON by default and does not support -mfpu. + if is_armv7() { + build.flag("-mfpu=neon-vfpv4"); + build.flag("-mfloat-abi=hard"); + } + build.compile("blake3_neon"); +} + +fn main() -> Result<(), Box<dyn std::error::Error>> { + if is_pure() && is_neon() { + panic!("It doesn't make sense to enable both \"pure\" and \"neon\"."); + } + + if is_x86_64() || is_x86_32() { + let support = c_compiler_support(); + if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler { + build_sse2_sse41_avx2_rust_intrinsics(); + } else { + // We assume that all C compilers can assemble SSE4.1 and AVX2. We + // don't explicitly check for support. + build_sse2_sse41_avx2_assembly(); + } + + if is_pure() || support == NoCompiler || support == NoAVX512 { + // The binary will not include any AVX-512 code.
+ } else if is_x86_32() || should_prefer_intrinsics() { + build_avx512_c_intrinsics(); + } else { + build_avx512_assembly(); + } + } + + if is_neon() { + build_neon_c_intrinsics(); + } + + // The `cc` crate doesn't automatically emit rerun-if directives for the + // environment variables it supports, in particular for $CC. We expect to + // do a lot of benchmarking across different compilers, so we explicitly + // add the variables that we're likely to need. + println!("cargo:rerun-if-env-changed=CC"); + println!("cargo:rerun-if-env-changed=CFLAGS"); + + // Ditto for source files, though these shouldn't change as often. + for file in std::fs::read_dir("c")? { + println!( + "cargo:rerun-if-changed={}", + file?.path().to_str().expect("utf-8") + ); + } + + Ok(()) +} diff --git a/3rdparty/BLAKE3/c/.gitignore b/3rdparty/BLAKE3/c/.gitignore new file mode 100644 index 000000000..0bf608cee --- /dev/null +++ b/3rdparty/BLAKE3/c/.gitignore @@ -0,0 +1,3 @@ +blake3 +example +*.o diff --git a/3rdparty/BLAKE3/c/Makefile.testing b/3rdparty/BLAKE3/c/Makefile.testing new file mode 100644 index 000000000..41e6b8285 --- /dev/null +++ b/3rdparty/BLAKE3/c/Makefile.testing @@ -0,0 +1,78 @@ +# This Makefile is only for testing. C callers should follow the instructions +# in ./README.md to incorporate these C files into their existing build. 
+ +NAME=blake3 +CC=gcc +CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden +LDFLAGS=-pie -Wl,-z,relro,-z,now +TARGETS= +ASM_TARGETS= +EXTRAFLAGS=-Wa,--noexecstack + +ifdef BLAKE3_NO_SSE2 +EXTRAFLAGS += -DBLAKE3_NO_SSE2 +else +TARGETS += blake3_sse2.o +ASM_TARGETS += blake3_sse2_x86-64_unix.S +endif + +ifdef BLAKE3_NO_SSE41 +EXTRAFLAGS += -DBLAKE3_NO_SSE41 +else +TARGETS += blake3_sse41.o +ASM_TARGETS += blake3_sse41_x86-64_unix.S +endif + +ifdef BLAKE3_NO_AVX2 +EXTRAFLAGS += -DBLAKE3_NO_AVX2 +else +TARGETS += blake3_avx2.o +ASM_TARGETS += blake3_avx2_x86-64_unix.S +endif + +ifdef BLAKE3_NO_AVX512 +EXTRAFLAGS += -DBLAKE3_NO_AVX512 +else +TARGETS += blake3_avx512.o +ASM_TARGETS += blake3_avx512_x86-64_unix.S +endif + +ifdef BLAKE3_USE_NEON +EXTRAFLAGS += -DBLAKE3_USE_NEON +TARGETS += blake3_neon.o +endif + +all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) + +blake3_sse2.o: blake3_sse2.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2 + +blake3_sse41.o: blake3_sse41.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1 + +blake3_avx2.o: blake3_avx2.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2 + +blake3_avx512.o: blake3_avx512.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl + +blake3_neon.o: blake3_neon.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ + +test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined +test: all + ./test.py + +asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) + +test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined +test_asm: asm + ./test.py + +example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS) + +clean: + rm -f $(NAME) *.o diff --git a/3rdparty/BLAKE3/c/README.md b/3rdparty/BLAKE3/c/README.md new file 
mode 100644 index 000000000..5e8b4e682 --- /dev/null +++ b/3rdparty/BLAKE3/c/README.md @@ -0,0 +1,270 @@ +The official C implementation of BLAKE3. + +# Example + +An example program that hashes bytes from standard input and prints the +result: + +```c +#include "blake3.h" +#include <stdio.h> +#include <unistd.h> + +int main() { + // Initialize the hasher. + blake3_hasher hasher; + blake3_hasher_init(&hasher); + + // Read input bytes from stdin. + unsigned char buf[65536]; + ssize_t n; + while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) { + blake3_hasher_update(&hasher, buf, n); + } + + // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + return 0; +} +``` + +The code above is included in this directory as `example.c`. If you're +on x86\_64 with a Unix-like OS, you can compile a working binary like +this: + +```bash +gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ + blake3_avx512_x86-64_unix.S +``` + +# API + +## The Struct + +```c +typedef struct { + // private fields +} blake3_hasher; +``` + +An incremental BLAKE3 hashing state, which can accept any number of +updates. This implementation doesn't allocate any heap memory, but +`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes +on x86-64. This size can be reduced by restricting the maximum input +length, as described in Section 5.4 of [the BLAKE3 +spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf), +but this implementation doesn't currently support that strategy. 
+ +## Common API Functions + +```c +void blake3_hasher_init( + blake3_hasher *self); +``` + +Initialize a `blake3_hasher` in the default hashing mode. + +--- + +```c +void blake3_hasher_update( + blake3_hasher *self, + const void *input, + size_t input_len); +``` + +Add input to the hasher. This can be called any number of times. + +--- + +```c +void blake3_hasher_finalize( + const blake3_hasher *self, + uint8_t *out, + size_t out_len); +``` + +Finalize the hasher and emit an output of any length. This doesn't +modify the hasher itself, and it's possible to finalize again after +adding more input. The constant `BLAKE3_OUT_LEN` provides the default +output length, 32 bytes. + +## Less Common API Functions + +```c +void blake3_hasher_init_keyed( + blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +``` + +Initialize a `blake3_hasher` in the keyed hashing mode. The key must be +exactly 32 bytes. + +--- + +```c +void blake3_hasher_init_derive_key( + blake3_hasher *self, + const char *context); +``` + +Initialize a `blake3_hasher` in the key derivation mode. The context +string is given as an initialization parameter, and afterwards input key +material should be given with `blake3_hasher_update`. The context string +is a null-terminated C string which should be **hardcoded, globally +unique, and application-specific**. The context string should not +include any dynamic input like salts, nonces, or identifiers read from a +database at runtime. A good default format for the context string is +`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com +2019-12-25 16:18:03 session tokens v1"`. + +This function is intended for application code written in C. For +language bindings, see `blake3_hasher_init_derive_key_raw` below. 
+ +--- + +```c +void blake3_hasher_init_derive_key_raw( + blake3_hasher *self, + const void *context, + size_t context_len); +``` + +As `blake3_hasher_init_derive_key` above, except that the context string +is given as a pointer to an array of arbitrary bytes with a provided +length. This is intended for writing language bindings, where C string +conversion would add unnecessary overhead and new error cases. Unicode +strings should be encoded as UTF-8. + +Application code in C should prefer `blake3_hasher_init_derive_key`, +which takes the context as a C string. If you need to use arbitrary +bytes as a context string in application code, consider whether you're +violating the requirement that context strings should be hardcoded. + +--- + +```c +void blake3_hasher_finalize_seek( + const blake3_hasher *self, + uint64_t seek, + uint8_t *out, + size_t out_len); +``` + +The same as `blake3_hasher_finalize`, but with an additional `seek` +parameter for the starting byte position in the output stream. To +efficiently stream a large output without allocating memory, call this +function in a loop, incrementing `seek` by the output length each time. + +# Building + +This implementation is just C and assembly files. It doesn't include a +public-facing build system. (The `Makefile` in this directory is only +for testing.) Instead, the intention is that you can include these files +in whatever build system you're already using. This section describes +the commands your build system should execute, or which you can execute +by hand. Note that these steps may change in future versions. + +## x86 + +Dynamic dispatch is enabled by default on x86. The implementation will +query the CPU at runtime to detect SIMD support, and it will use the +widest instruction set available. By default, `blake3_dispatch.c` +expects to be linked with code for five different instruction sets: +portable C, SSE2, SSE4.1, AVX2, and AVX-512. 
+ +For each of the x86 SIMD instruction sets, two versions are available, +one in assembly (which is further divided into three flavors: Unix, +Windows MSVC, and Windows GNU) and one using C intrinsics. The assembly +versions are generally preferred: they perform better, they perform more +consistently across different compilers, and they build more quickly. On +the other hand, the assembly versions are x86\_64-only, and you need to +select the right flavor for your target platform. + +Here's an example of building a shared library on x86\_64 Linux using +the assembly implementations: + +```bash +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ + blake3_avx512_x86-64_unix.S +``` + +When building the intrinsics-based implementations, you need to build +each implementation separately, with the corresponding instruction set +explicitly enabled in the compiler. Here's the same shared library using +the intrinsics-based implementations: + +```bash +gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o +gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o +gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o +gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o +``` + +Note above that building `blake3_avx512.c` requires both `-mavx512f` and +`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512` +flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`. +MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a +corresponding flag. + +If you want to omit SIMD code entirely, you need to explicitly disable +each instruction set. 
Here's an example of building a shared library on +x86 with only portable code: + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \ + -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c +``` + +## ARM NEON + +The NEON implementation is not enabled by default on ARM, since not all +ARM targets support it. To enable it, set `BLAKE3_USE_NEON=1`. Here's an +example of building a shared library on ARM Linux with NEON support: + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON blake3.c blake3_dispatch.c \ + blake3_portable.c blake3_neon.c +``` + +Note that on some targets (ARMv7 in particular), extra flags may be +required to activate NEON support in the compiler. If you see an error +like... + +``` +/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed +in call to always_inline ‘vaddq_u32’: target specific option mismatch +``` + +...then you may need to add something like `-mfpu=neon-vfpv4 +-mfloat-abi=hard`. + +## Other Platforms + +The portable implementation should work on most other architectures. For +example: + +```bash +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c +``` + +# Differences from the Rust Implementation + +The single-threaded Rust and C implementations use the same algorithms, +and their performance is the same if you use the assembly +implementations or if you compile the intrinsics-based implementations +with Clang. (Both Clang and rustc are LLVM-based.) + +The C implementation doesn't currently include any multithreading +optimizations. OpenMP support or similar might be added in the future. 
diff --git a/3rdparty/BLAKE3/c/blake3.c b/3rdparty/BLAKE3/c/blake3.c new file mode 100644 index 000000000..7abf5324e --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3.c @@ -0,0 +1,607 @@ +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +#include "blake3.h" +#include "blake3_impl.h" + +const char * blake3_version(void) { + return BLAKE3_VERSION_STRING; +} + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + 
memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, 
BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) { + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. 
+INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. 
+INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. 
+// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +static size_t blake3_compress_subtree_wide(const uint8_t *input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, uint8_t *out) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, + out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) 
+ degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, + chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide( + right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, + out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. 
+INLINE void compress_subtree_to_parent_node( + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // blake3_compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher *self, const 
char *context) { + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// blake3_compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. 
+// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. 
This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. + if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. 
+ while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. 
+ uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, + self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], + self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. 
+ if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. 
+ cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, + self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} diff --git a/3rdparty/BLAKE3/c/blake3.h b/3rdparty/BLAKE3/c/blake3.h new file mode 100644 index 000000000..57ebd5adc --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3.h @@ -0,0 +1,60 @@ +#ifndef BLAKE3_H +#define BLAKE3_H + +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "0.3.7" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 +#define BLAKE3_MAX_SIMD_DEGREE 16 + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below. +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. 
+ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +const char * blake3_version(void); +void blake3_hasher_init(blake3_hasher *self); +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/3rdparty/BLAKE3/c/blake3_avx2.c b/3rdparty/BLAKE3/c/blake3_avx2.c new file mode 100644 index 000000000..c5a2ce9e2 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_avx2.c @@ -0,0 +1,325 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 8 + +INLINE __m256i loadu(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE void storeu(__m256i src, uint8_t dest[16]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. 
+INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m256i rot16(__m256i x) { + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +} + +INLINE __m256i rot12(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); +} + +INLINE __m256i rot8(__m256i x) { + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, + 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +} + +INLINE __m256i rot7(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); +} + +INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); 
+ v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], 
v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m256i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[8]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + const __m256i mask = 
_mm256_set1_epi32(-(int32_t)increment_counter); + const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i add1 = _mm256_and_si256(mask, add0); + __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); + __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), + _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); + __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); + *out_lo = l; + *out_hi = h; +} + +void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] 
= xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(h_vecs); + storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +#if !defined(BLAKE3_NO_SSE41) +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#else +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + +void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } +#if !defined(BLAKE3_NO_SSE41) + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); +#else + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +#endif +} diff --git a/3rdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S b/3rdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S new file mode 100644 index 000000000..812bb8568 --- /dev/null +++ 
b/3rdparty/BLAKE3/c/blake3_avx2_x86-64_unix.S @@ -0,0 +1,1815 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx 
+ xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, 
ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + 
vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld 
ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, 
ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb 
ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, 
ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr 
[rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, 
ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor 
ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, 
ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, 
ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, 
ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, 
ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, 
ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, 
ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor 
ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, 
ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, 
ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr 
[rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, 
ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, 
ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps 
xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, 
ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword 
ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 
214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 /* Read-only constant tables for the AVX2 code above; .p2align 6 starts them on a 64-byte boundary. */ +ADD0: /* Eight dwords 0..7, one per 32-bit lane of a ymm register. NOTE(review): the code that reads this table is outside this view; presumably a per-lane offset - confirm against the prologue. */ + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: /* Eight dwords, all 8. NOTE(review): presumably the per-iteration step of the 8-way loop (which does "sub rsi, 8"); the reader of this table is outside this view - confirm. */ + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: /* BLAKE3_IV_0 .. BLAKE3_IV_3: each of the four words listed under BLAKE3_IV below, replicated into all eight dword lanes. */ + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: /* 0x40 (64) in every dword lane, the same value as the 64-byte step (add rdx, 64) of the block loops; the tail paths read it with vbroadcasti128 or vpinsrd. */ + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: /* vpshufb byte-shuffle mask: within each dword it swaps the two 16-bit halves, i.e. a 16-bit rotate. */ + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: /* vpshufb byte-shuffle mask: within each dword every result byte takes the next higher source byte (wrapping), i.e. a rotate right by 8 bits. */ + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: /* Sign bit of every dword. The 8-way loop XORs it into both operands before vpcmpgtd so the signed compare orders them as unsigned values; the compare result is then applied with vpsubd as a carry into a second vector. */ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: /* The four IV words once, not replicated; loaded with vbroadcasti128 in the ymm tail paths and with vmovdqa in the xmm tail path. */ + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/3rdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S b/3rdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S new file mode 100644 index 000000000..bb58d2ae6 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_avx2_x86-64_windows_gnu.S @@ -0,0 +1,1817 @@ +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +.section
.text + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x2D0], xmm6 + vmovdqa xmmword ptr [rsp+0x2E0], xmm7 + vmovdqa xmmword ptr [rsp+0x2F0], xmm8 + vmovdqa xmmword ptr [rsp+0x300], xmm9 + vmovdqa xmmword ptr [rsp+0x310], xmm10 + vmovdqa xmmword ptr [rsp+0x320], xmm11 + vmovdqa xmmword ptr [rsp+0x330], xmm12 + vmovdqa xmmword ptr [rsp+0x340], xmm13 + vmovdqa xmmword ptr [rsp+0x350], xmm14 + vmovdqa xmmword ptr [rsp+0x360], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x260], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x2A0], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2C0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr 
[rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2C0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + 
vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr 
[rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x220] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x240] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, 
ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 
+ vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + 
vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd 
ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, 
ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + 
vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, 
ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, 
ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + 
vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + 
vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 
+ vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, 
ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, 
ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + 
vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd 
ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + 
vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x2A0] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x220] + vmovdqa ymmword ptr [rsp+0x220], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x240] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x2D0] + vmovdqa xmm7, xmmword ptr [rsp+0x2E0] + vmovdqa xmm8, xmmword ptr [rsp+0x2F0] + vmovdqa xmm9, xmmword ptr [rsp+0x300] + vmovdqa xmm10, xmmword ptr [rsp+0x310] + vmovdqa xmm11, xmmword ptr [rsp+0x320] + vmovdqa xmm12, xmmword ptr [rsp+0x330] + vmovdqa xmm13, xmmword ptr [rsp+0x340] + vmovdqa xmm14, xmmword ptr [rsp+0x350] + vmovdqa xmm15, xmmword ptr [rsp+0x360] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x2C0] + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x220] + vbroadcasti128 ymm13, xmmword ptr 
[rsp+0x240] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 
0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + 
vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr 
[rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x260] + vmovaps xmm0, xmmword ptr [rsp+0x220] + vmovaps xmm1, xmmword ptr [rsp+0x230] + vmovaps xmm2, xmmword ptr [rsp+0x240] + vmovaps xmm3, xmmword ptr [rsp+0x250] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x220], xmm0 + vmovaps xmmword ptr [rsp+0x240], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x220] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x224] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, 
xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x260] + vmovaps ymm0, ymmword ptr [rsp+0x220] + vmovups ymm1, 
ymmword ptr [rsp+0x228] + vmovaps ymm2, ymmword ptr [rsp+0x240] + vmovups ymm3, ymmword ptr [rsp+0x248] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x220], ymm0 + vmovaps ymmword ptr [rsp+0x240], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x220] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, 
xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.section .rodata +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git 
a/3rdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm b/3rdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm new file mode 100644 index 000000000..352298edd --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_avx2_x86-64_windows_msvc.asm @@ -0,0 +1,1828 @@ +public _blake3_hash_many_avx2 +public blake3_hash_many_avx2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx2 PROC +_blake3_hash_many_avx2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+2D0H], xmm6 + vmovdqa xmmword ptr [rsp+2E0H], xmm7 + vmovdqa xmmword ptr [rsp+2F0H], xmm8 + vmovdqa xmmword ptr [rsp+300H], xmm9 + vmovdqa xmmword ptr [rsp+310H], xmm10 + vmovdqa xmmword ptr [rsp+320H], xmm11 + vmovdqa xmmword ptr [rsp+330H], xmm12 + vmovdqa xmmword ptr [rsp+340H], xmm13 + vmovdqa xmmword ptr [rsp+350H], xmm14 + vmovdqa xmmword ptr [rsp+360H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+260H], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0] + vpand ymm2, ymm0, ymmword ptr [ADD1] + vmovdqa ymmword ptr [rsp+2A0H], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+220H], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm3 + shl rdx, 6 + mov qword ptr [rsp+2C0H], rdx + cmp rsi, 8 + jc final7blocks +outerloop8: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword 
ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+2C0H] + cmove eax, ebx + mov dword ptr [rsp+200H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+20H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+40H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+60H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+80H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + 
vmovaps ymmword ptr [rsp+0A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0E0H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+100H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+120H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+140H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+160H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+180H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+1A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+1C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+1E0H], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+200H] + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr 
[r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+220H] + vpxor ymm13, ymm1, ymmword ptr [rsp+240H] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + 
vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, 
xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd 
ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + 
vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld 
ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, 
ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 
+ vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + 
vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+180H] + vpaddd ymm1, ymm1, ymmword ptr 
[rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld 
ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, 
ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, 
ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 
+ vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, 
ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, 
ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps 
ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+2A0H] + vpaddd ymm1, ymm0, ymmword ptr [rsp+220H] + vmovdqa ymmword ptr [rsp+220H], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+240H] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + sub rsi, 8 + cmp rsi, 8 + jnc outerloop8 + test rsi, rsi + jnz final7blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+2D0H] + vmovdqa xmm7, xmmword ptr [rsp+2E0H] + vmovdqa xmm8, xmmword ptr [rsp+2F0H] + vmovdqa xmm9, xmmword ptr [rsp+300H] + vmovdqa xmm10, xmmword ptr [rsp+310H] + vmovdqa xmm11, xmmword ptr [rsp+320H] + vmovdqa xmm12, xmmword ptr [rsp+330H] + vmovdqa xmm13, xmmword ptr [rsp+340H] + vmovdqa xmm14, xmmword ptr [rsp+350H] + vmovdqa xmm15, xmmword ptr [rsp+360H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+2C0H] + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + test rsi, 4H + je final3blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 
ymm12, xmmword ptr [rsp+220H] + vbroadcasti128 ymm13, xmmword ptr [rsp+240H] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 50H + vpermq ymm15, ymm15, 50H + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] + vpblendd ymm14, ymm14, ymm12, 44H + vpblendd ymm15, ymm15, ymm12, 44H + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+20H], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vmovups ymm2, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + vmovups ymm10, ymmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 93H + vpshufd ymm15, ymm15, 93H + vpbroadcastd ymm2, dword ptr [rsp+200H] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+20H] + 
vpblendd ymm3, ymm3, ymm2, 88H + vpblendd ymm11, ymm11, ymm2, 88H + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vmovdqa ymm10, ymm2 + mov al, 7 +roundloop4: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+40H], ymm4 + nop + vmovdqa ymmword ptr [rsp+60H], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+80H], ymm5 + vmovdqa ymmword ptr [rsp+0A0H], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 93H + vpshufd ymm8, ymm8, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 39H + vpshufd ymm10, ymm10, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, 
ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 39H + vpshufd ymm8, ymm8, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 93H + vpshufd ymm10, ymm10, 93H + dec al + je endroundloop4 + vmovdqa ymm4, ymmword ptr [rsp+40H] + vmovdqa ymm5, ymmword ptr [rsp+80H] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0FH + vpshufd ymm4, ymm12, 39H + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0AAH + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 88H + vpshufd ymm12, ymm12, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymmword ptr [rsp+40H], ymm13 + vmovdqa ymmword ptr [rsp+80H], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+60H] + vmovdqa ymm13, ymmword ptr [rsp+0A0H] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0FH + vpshufd ymm12, ymm5, 39H + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0AAH + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 88H + vpshufd ymm5, ymm5, 78H + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 1EH + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+40H] + vmovdqa ymm6, ymmword ptr [rsp+80H] + jmp roundloop4 +endroundloop4: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 
xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqu xmmword ptr [rbx+40H], xmm8 + vmovdqu xmmword ptr [rbx+50H], xmm9 + vextracti128 xmmword ptr [rbx+60H], ymm8, 01H + vextracti128 xmmword ptr [rbx+70H], ymm9, 01H + vmovaps xmm8, xmmword ptr [rsp+260H] + vmovaps xmm0, xmmword ptr [rsp+220H] + vmovaps xmm1, xmmword ptr [rsp+230H] + vmovaps xmm2, xmmword ptr [rsp+240H] + vmovaps xmm3, xmmword ptr [rsp+250H] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+220H], xmm0 + vmovaps xmmword ptr [rsp+240H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test rsi, 2H + je final1blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp+220H] + vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+224H] + vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + vbroadcasti128 ymm14, xmmword ptr [ROT16] + vbroadcasti128 ymm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+200H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 
01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovaps ymm8, ymmword ptr [rsp+260H] + vmovaps ymm0, ymmword ptr [rsp+220H] + vmovups ymm1, 
ymmword ptr [rsp+228H] + vmovaps ymm2, ymmword ptr [rsp+240H] + vmovups ymm3, ymmword ptr [rsp+248H] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+220H], ymm0 + vmovaps ymmword ptr [rsp+240H], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1blocks: + test rsi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm3, dword ptr [rsp+220H] + vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm14, xmmword ptr [ROT16] + vmovdqa xmm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 
+ vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx2 ENDP +blake3_hash_many_avx2 ENDP +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + +ADD1: + dd 8 dup (8) + +BLAKE3_IV_0: + dd 8 dup (6A09E667H) + +BLAKE3_IV_1: + dd 8 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 8 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 8 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 8 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +_RDATA ENDS +END diff --git a/3rdparty/BLAKE3/c/blake3_avx512.c b/3rdparty/BLAKE3/c/blake3_avx512.c new file mode 100644 index 000000000..77a5c385c --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_avx512.c @@ -0,0 +1,1204 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i 
loadu_128(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE __m256i loadu_256(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE __m512i loadu_512(const uint8_t src[64]) { + return _mm512_loadu_si512((const __m512i *)src); +} + +INLINE void storeu_128(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE void storeu_256(__m256i src, uint8_t dest[16]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } + +INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } + +INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } + +INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } + +INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } + +INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } + +INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } + +INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } + +INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } + +INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } + +INLINE __m512i 
rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); } + +INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } + +INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } + +INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } + +/* + * ---------------------------------------------------------------------------- + * compress_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot16_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot12_128(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot8_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot7_128(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. 
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu_128((uint8_t *)&cv[0]); + rows[1] = loadu_128((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + 
undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), &out[0]); + storeu_128(xor_128(rows[1], rows[3]), &out[16]); + storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); + storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); +} + +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +/* + * ---------------------------------------------------------------------------- + * hash4_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] 
= rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], 
m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(__m128i vecs[4]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + uint64_t mask = (increment_counter ? 
~0 : 0); + __m256i mask_vec = _mm256_set1_epi64x(mask); + __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); + deltas = _mm256_and_si256(mask_vec, deltas); + __m256i counters = + _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); + *out_lo = _mm256_cvtepi64_epi32(counters); + *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); +} + +void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1_128(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = 
flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash8_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[15] = rot16_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot12_256(v[4]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + 
v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[15] = rot8_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot7_256(v[4]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot16_256(v[15]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[4] = rot12_256(v[4]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = 
add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot8_256(v[15]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + v[4] = rot7_256(v[4]); +} + +INLINE void transpose_vecs_256(__m256i vecs[8]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_256(&out[0]); + transpose_vecs_256(&out[8]); +} + +INLINE void load_counters8(uint64_t counter, bool increment_counter, + __m256i *out_lo, 
__m256i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m512i mask_vec = _mm512_set1_epi64(mask); + __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + deltas = _mm512_and_si512(mask_vec, deltas); + __m512i counters = + _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); + *out_lo = _mm512_cvtepi64_epi32(counters); + *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); +} + +void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), + set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1_256(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn8(v, msg_vecs, 0); + round_fn8(v, msg_vecs, 1); + round_fn8(v, msg_vecs, 2); + round_fn8(v, msg_vecs, 3); + round_fn8(v, msg_vecs, 4); + round_fn8(v, msg_vecs, 5); + round_fn8(v, msg_vecs, 6); + h_vecs[0] = xor_256(v[0], v[8]); + h_vecs[1] = xor_256(v[1], v[9]); + h_vecs[2] = xor_256(v[2], v[10]); + h_vecs[3] = xor_256(v[3], v[11]); + h_vecs[4] = xor_256(v[4], v[12]); + h_vecs[5] = xor_256(v[5], v[13]); + h_vecs[6] = 
xor_256(v[6], v[14]); + h_vecs[7] = xor_256(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_256(h_vecs); + storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash16_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[15] = rot16_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot12_512(v[4]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_512(v[0], v[4]); + v[1] = 
add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[15] = rot8_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot7_512(v[4]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot16_512(v[15]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[4] = rot12_512(v[4]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = 
xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot8_512(v[15]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + v[4] = rot7_512(v[4]); +} + +// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order +#define LO_IMM8 0x88 + +INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, LO_IMM8); +} + +// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order +#define HI_IMM8 0xdd + +INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, HI_IMM8); +} + +INLINE void transpose_vecs_512(__m512i vecs[16]) { + // Interleave 32-bit lanes. The _0 unpack is lanes + // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes + // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 
+ __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); + __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); + __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); + __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); + __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); + __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); + __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); + __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); + __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); + __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); + __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); + __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); + __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); + __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); + __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); + __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); + + // Interleave 64-bit lanes. The _0 unpack is lanes + // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes + // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes + // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes + // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. 
+ __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); + __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); + __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); + __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); + __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); + __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); + __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); + __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); + __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); + __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); + __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); + __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); + __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); + __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); + __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); + __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); + + // Interleave 128-bit lanes. The _0 unpack is + // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is + // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. + __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); + __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); + __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); + __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); + __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); + __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); + __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); + __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); + __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); + __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); + __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); + __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); + __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); + __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); + __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); + __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); + + // Interleave 128-bit lanes again for the final outputs. 
+ vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); + vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); + vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); + vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); + vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); + vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); + vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); + vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); + vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); + vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); + vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); + vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); + vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); + vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); + vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); + vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); +} + +INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, + size_t block_offset, __m512i out[16]) { + out[0] = loadu_512(&inputs[0][block_offset]); + out[1] = loadu_512(&inputs[1][block_offset]); + out[2] = loadu_512(&inputs[2][block_offset]); + out[3] = loadu_512(&inputs[3][block_offset]); + out[4] = loadu_512(&inputs[4][block_offset]); + out[5] = loadu_512(&inputs[5][block_offset]); + out[6] = loadu_512(&inputs[6][block_offset]); + out[7] = loadu_512(&inputs[7][block_offset]); + out[8] = loadu_512(&inputs[8][block_offset]); + out[9] = loadu_512(&inputs[9][block_offset]); + out[10] = loadu_512(&inputs[10][block_offset]); + out[11] = loadu_512(&inputs[11][block_offset]); + out[12] = loadu_512(&inputs[12][block_offset]); + out[13] = loadu_512(&inputs[13][block_offset]); + out[14] = loadu_512(&inputs[14][block_offset]); + out[15] = loadu_512(&inputs[15][block_offset]); + for (size_t i = 0; i < 16; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_512(out); +} + +INLINE void load_counters16(uint64_t counter, bool increment_counter, + __m512i *out_lo, __m512i *out_hi) { + const __m512i mask = 
_mm512_set1_epi32(-(int32_t)increment_counter); + const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m512i add1 = _mm512_and_si512(mask, add0); + __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); + __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); + __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); + *out_lo = l; + *out_hi = h; +} + +void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { + __m512i h_vecs[8] = { + set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), + set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), + }; + __m512i counter_low_vec, counter_high_vec; + load_counters16(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); + __m512i block_flags_vec = set1_512(block_flags); + __m512i msg_vecs[16]; + transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m512i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn16(v, msg_vecs, 0); + round_fn16(v, msg_vecs, 1); + round_fn16(v, msg_vecs, 2); + round_fn16(v, msg_vecs, 3); + round_fn16(v, msg_vecs, 4); + round_fn16(v, msg_vecs, 5); + round_fn16(v, msg_vecs, 6); + h_vecs[0] = xor_512(v[0], v[8]); + h_vecs[1] = xor_512(v[1], v[9]); + h_vecs[2] = xor_512(v[2], v[10]); + h_vecs[3] = xor_512(v[3], v[11]); + 
h_vecs[4] = xor_512(v[4], v[12]); + h_vecs[5] = xor_512(v[5], v[13]); + h_vecs[6] = xor_512(v[6], v[14]); + h_vecs[7] = xor_512(v[7], v[15]); + + block_flags = flags; + } + + // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 + // state vectors. Pad the matrix with zeros. After transposition, store the + // lower half of each vector. + __m512i padded[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + }; + transpose_vecs_512(padded); + _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); + _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); + _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); + _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); + _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); + _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); + _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); + _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); + _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); + _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); + _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); + _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); + _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); + 
_mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); + _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); + _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 16) { + blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 16; + } + inputs += 16; + num_inputs -= 16; + out = &out[16 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 8) { + blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 8; + } + inputs += 8; + num_inputs -= 8; + out = &out[8 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 4) { + blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if 
(increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/3rdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S b/3rdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S new file mode 100644 index 000000000..a06aede0f --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_avx512_x86-64_unix.S @@ -0,0 +1,2585 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +#ifdef __APPLE__ +.text +#else +.section .text +#endif +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], 
ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, 
ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq 
zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr 
[BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord 
zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + 
vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, 
zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, 
zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd 
zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, 
zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 
16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, 
zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, 
zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + 
vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, 
zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: 
+ movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + 
vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd 
ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, 
ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 
16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, 
ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord 
ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd 
ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + 
vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + 
vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd 
ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + 
vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor 
ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + 
vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, 
zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, 
xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd 
ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, 
xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 /* the entry point below is aligned to 64 bytes */ +_blake3_compress_in_place_avx512: /* same entry under two names, with and without a leading underscore; presumably for targets that prefix C symbols with an underscore (confirm) */ +blake3_compress_in_place_avx512: /* Reads 32 bytes from [rdi] into xmm0 and xmm1 and 64 bytes from [rsi]; packs rcx, dl and r8b into xmm3; loads BLAKE3_IV into xmm2; runs the loop at local label 9 seven times; then stores xmm0 XOR xmm2 and xmm1 XOR xmm3 back over the 32 bytes at [rdi]. NOTE(review): presumably rdi = chaining value, rsi = message block, dl = block length, rcx = counter, r8b = flags, following the System V argument order; confirm against the C declaration. */ + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax /* rdx = zero-extended dl in the low dword, zero-extended r8b in the high dword */ + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3,
xmm3, xmm4 /* xmm3 = rcx in the low qword, rdx in the high qword */ + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* aligned load: BLAKE3_IV is placed right after a .p2align 6 in the data section */ + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 /* immediate 136 picks dwords 0 and 2 of each source */ + vshufps xmm5, xmm8, xmm9, 221 /* immediate 221 picks dwords 1 and 3 of each source */ + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 /* xmm4..xmm7 now hold the 64 bytes read from [rsi], rearranged for the loop below */ + mov al, 7 /* al is the pass counter for the loop at local label 9 */ +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 /* dword lanes of xmm0, xmm3 and xmm2 are rotated before the second add/xor/rotate sequence */ + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 /* inverse shuffles: the lane rotation applied above is undone */ + dec al + jz 9f /* after the seventh pass, skip the xmm4..xmm7 reshuffle and leave the loop */ + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 /* xmm4..xmm7 have been rearranged for the next pass */ + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 /* the 32-byte result overwrites the input at [rdi] */ + ret + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: /* Same register setup and seven-pass loop as blake3_compress_in_place_avx512, but nothing is written to [rdi]: the four result registers are stored as 64 bytes at [r9]. NOTE(review): presumably r9 is the output buffer argument; confirm against the C declaration. */ + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl +
shl rax, 32 + add rdx, rax /* rdx = zero-extended dl in the low dword, zero-extended r8b in the high dword */ + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 /* al is the pass counter for the loop at local label 9 */ +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f /* after the seventh pass, skip the xmm4..xmm7 reshuffle and leave the loop */ + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] /* xmm2 and xmm3 are additionally XORed with the 32 bytes at [rdi] */ + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr
[r9+0x30], xmm3 + ret + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif /* constant tables: a different section directive is chosen when __APPLE__ is defined */ +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 /* 64-byte alignment, so the aligned xmmword loads from BLAKE3_IV are valid */ +BLAKE3_IV: /* BLAKE3_IV_0..BLAKE3_IV_3 label the four consecutive dwords, so the table can be read as one xmmword through BLAKE3_IV or one dword at a time */ +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/3rdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S b/3rdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S new file mode 100644 index 000000000..e10b9f36c --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_avx512_x86-64_windows_gnu.S @@ -0,0 +1,2615 @@ +.intel_syntax noprefix + +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +.section .text +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x90], xmm6 + vmovdqa xmmword ptr [rsp+0xA0], xmm7 + vmovdqa xmmword ptr [rsp+0xB0], xmm8 + vmovdqa xmmword ptr [rsp+0xC0], xmm9 + vmovdqa xmmword ptr [rsp+0xD0], xmm10 + vmovdqa xmmword ptr [rsp+0xE0], xmm11 + vmovdqa xmmword ptr [rsp+0xF0], xmm12 + vmovdqa xmmword ptr [rsp+0x100], xmm13 + vmovdqa xmmword ptr [rsp+0x110], xmm14 + vmovdqa xmmword ptr [rsp+0x120], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd
ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + vmovdqa ymmword ptr [rsp+0x60], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, 
zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr 
[r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + 
vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, 
zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, 
zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, 
zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + 
vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + 
vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 
+ vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 
7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, 
zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord 
zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd 
zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq 
zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x90], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jne 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x90] + vmovdqa xmm7, xmmword ptr [rsp+0xA0] + vmovdqa xmm8, xmmword ptr [rsp+0xB0] + vmovdqa xmm9, xmmword ptr [rsp+0xC0] + vmovdqa xmm10, xmmword ptr [rsp+0xD0] + vmovdqa xmm11, xmmword ptr [rsp+0xE0] + vmovdqa xmm12, xmmword ptr [rsp+0xF0] + vmovdqa xmm13, xmmword ptr [rsp+0x100] + vmovdqa xmm14, xmmword ptr [rsp+0x110] + vmovdqa 
xmm15, xmmword ptr [rsp+0x120] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, 
xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr 
[BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 
+ vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, 
ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, 
ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + 
vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 
+ vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + 
vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord 
ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord 
ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd 
ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 
+ vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 2b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + 
vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x40] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x40], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x78] + movzx r12, byte ptr [rbp+0x88] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x40] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr 
[r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + 
vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps 
ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + 
vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, 
xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+0x10], xmm7 + vmovdqa xmmword ptr [rsp+0x20], xmm8 + vmovdqa xmmword ptr [rsp+0x30], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + movzx eax, byte ptr [rsp+0x70] + movzx r8d, r8b + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+0x20] + vmovups xmm9, xmmword ptr [rdx+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, 
xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rcx], xmm0 + vmovdqu xmmword ptr [rcx+0x10], xmm1 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x10] + vmovdqa xmm8, xmmword ptr [rsp+0x20] + vmovdqa xmm9, xmmword ptr [rsp+0x30] + add rsp, 72 + ret + + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+0x10], xmm7 + vmovdqa xmmword ptr [rsp+0x20], xmm8 + vmovdqa xmmword ptr [rsp+0x30], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + movzx eax, byte ptr [rsp+0x70] + movzx r8d, r8b + mov r10, qword ptr [rsp+0x78] + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+0x20] + vmovups xmm9, xmmword ptr [rdx+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + 
vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, xmmword ptr [rcx] + vpxor xmm3, xmm3, xmmword ptr [rcx+0x10] + vmovdqu xmmword ptr [r10], xmm0 + vmovdqu xmmword ptr [r10+0x10], xmm1 + vmovdqu xmmword ptr [r10+0x20], xmm2 + vmovdqu xmmword ptr [r10+0x30], xmm3 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x10] + vmovdqa xmm8, xmmword ptr [rsp+0x20] + vmovdqa xmm9, xmmword ptr [rsp+0x30] + add rsp, 72 + ret + +.section .rodata +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 
+BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/3rdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm b/3rdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm new file mode 100644 index 000000000..b19efbaae --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_avx512_x86-64_windows_msvc.asm @@ -0,0 +1,2634 @@ +public _blake3_hash_many_avx512 +public blake3_hash_many_avx512 +public blake3_compress_in_place_avx512 +public _blake3_compress_in_place_avx512 +public blake3_compress_xof_avx512 +public _blake3_compress_xof_avx512 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx512 PROC +_blake3_hash_many_avx512 PROC + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+90H], xmm6 + vmovdqa xmmword ptr [rsp+0A0H], xmm7 + vmovdqa xmmword ptr [rsp+0B0H], xmm8 + vmovdqa xmmword ptr [rsp+0C0H], xmm9 + vmovdqa xmmword ptr [rsp+0D0H], xmm10 + vmovdqa xmmword ptr [rsp+0E0H], xmm11 + vmovdqa xmmword ptr [rsp+0F0H], xmm12 + vmovdqa xmmword ptr [rsp+100H], xmm13 + vmovdqa xmmword ptr [rsp+110H], xmm14 + vmovdqa xmmword ptr [rsp+120H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32] + vpcmpud k2, ymm2, ymm0, 1 + vpcmpud k3, ymm3, ymm0, 1 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
+ vpbroadcastd ymm6, dword ptr [ADD1] + vpaddd ymm4 {k2}, ymm4, ymm6 + vpaddd ymm5 {k3}, ymm5, ymm6 + ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8} + ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+20H], ymm3 + vmovdqa ymmword ptr [rsp+40H], ymm4 + vmovdqa ymmword ptr [rsp+60H], ymm5 + shl rdx, 6 + mov qword ptr [rsp+80H], rdx + cmp rsi, 16 + jc final15blocks +outerloop16: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+1H*4H] + vpbroadcastd zmm2, dword ptr [rcx+2H*4H] + vpbroadcastd zmm3, dword ptr [rcx+3H*4H] + vpbroadcastd zmm4, dword ptr [rcx+4H*4H] + vpbroadcastd zmm5, dword ptr [rcx+5H*4H] + vpbroadcastd zmm6, dword ptr [rcx+6H*4H] + vpbroadcastd zmm7, dword ptr [rcx+7H*4H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop16: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword 
ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0] + vmovdqa32 zmm31, zmmword ptr [INDEX1] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm8, zmm24, zmm25 
+ vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + mov r8, qword ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 
+ vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd zmm15, dword ptr [rsp+22H*4H] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, 
zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 
16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, 
zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord 
zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd 
zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + 
vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + 
vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord 
zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + 
vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + 
vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop16 + mov rbx, qword ptr [rbp+90H] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, 
zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 88H + vshufi32x4 zmm17, zmm1, zmm5, 88H + vshufi32x4 zmm18, zmm2, zmm6, 88H + vshufi32x4 zmm19, zmm3, zmm7, 88H + vshufi32x4 zmm20, zmm0, zmm4, 0DDH + vshufi32x4 zmm21, zmm1, zmm5, 0DDH + vshufi32x4 zmm22, zmm2, zmm6, 0DDH + vshufi32x4 zmm23, zmm3, zmm7, 0DDH + vshufi32x4 zmm0, zmm16, zmm17, 88H + vshufi32x4 zmm1, zmm18, zmm19, 88H + vshufi32x4 zmm2, zmm20, zmm21, 88H + vshufi32x4 zmm3, zmm22, zmm23, 88H + vshufi32x4 zmm4, zmm16, zmm17, 0DDH + vshufi32x4 zmm5, zmm18, zmm19, 0DDH + vshufi32x4 zmm6, zmm20, zmm21, 0DDH + vshufi32x4 zmm7, zmm22, zmm23, 0DDH + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1 + vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2 + vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3 + vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4 + vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5 + vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6 + vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H] + vmovdqa32 zmm2, zmm0 + ; XXX: ml64.exe does not currently understand the embedded-broadcast ({1to16}) memory-operand syntax. As a workaround, the constants are broadcast into registers first. 
+ vpbroadcastd zmm4, dword ptr [ADD16] + vpbroadcastd zmm5, dword ptr [ADD1] + vpaddd zmm2{k1}, zmm0, zmm4 + ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16} + vpcmpud k2, zmm2, zmm0, 1 + vpaddd zmm1 {k2}, zmm1, zmm5 + ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+90H], rbx + sub rsi, 16 + cmp rsi, 16 + jnc outerloop16 + test rsi, rsi + jne final15blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+90H] + vmovdqa xmm7, xmmword ptr [rsp+0A0H] + vmovdqa xmm8, xmmword ptr [rsp+0B0H] + vmovdqa xmm9, xmmword ptr [rsp+0C0H] + vmovdqa xmm10, xmmword ptr [rsp+0D0H] + vmovdqa xmm11, xmmword ptr [rsp+0E0H] + vmovdqa xmm12, xmmword ptr [rsp+0F0H] + vmovdqa xmm13, xmmword ptr [rsp+100H] + vmovdqa xmm14, xmmword ptr [rsp+110H] + vmovdqa xmm15, xmmword ptr [rsp+120H] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final15blocks: + test esi, 8H + je final7blocks + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, 
xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword 
ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+40H] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd ymm15, dword ptr [rsp+88H] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, 
ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, 
ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord 
ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd 
ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord 
ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + 
vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, 
ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, 
ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd 
ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, 
ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, 
ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+40H] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+40H], ymm2 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + add rdi, 64 + sub rsi, 8 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+80H] + movzx r13, byte ptr [rbp+78H] + movzx r12, byte ptr [rbp+88H] + test esi, 4H + je final3blocks + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+40H] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0DCH + vpermq ymm15, ymm15, 0DCH + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] + vinserti64x4 zmm13, zmm14, ymm15, 01H + mov eax, 17476 + 
kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+22H*4H] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-1H*40H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-30H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-20H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-10H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 93H + vpshufd zmm7, zmm7, 93H + mov al, 7 +roundloop4: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord 
zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 93H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 39H + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 39H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 93H + dec al + jz endroundloop4 + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0FH + vpshufd zmm4, zmm8, 39H + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 78H + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 1EH + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp roundloop4 +endroundloop4: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H + vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H + vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H + vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test esi, 2H + je final1block + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1 + vpinsrd xmm13, xmm13, dword ptr 
[BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+4H] + vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+88H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 
4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 
+ vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx512 ENDP +blake3_hash_many_avx512 ENDP + +ALIGN 16 +blake3_compress_in_place_avx512 PROC +_blake3_compress_in_place_avx512 PROC + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+10H], xmm7 + vmovdqa xmmword ptr [rsp+20H], xmm8 + vmovdqa xmmword ptr [rsp+30H], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + movzx eax, byte ptr [rsp+70H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword 
ptr [rdx+10H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+20H] + vmovups xmm9, xmmword ptr [rdx+30H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +@@: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz @F + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp @B +@@: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rcx], xmm0 + vmovdqu xmmword ptr [rcx+10H], xmm1 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+10H] + vmovdqa xmm8, xmmword ptr [rsp+20H] + vmovdqa xmm9, xmmword ptr [rsp+30H] + add rsp, 72 + ret +_blake3_compress_in_place_avx512 ENDP +blake3_compress_in_place_avx512 ENDP + +ALIGN 16 +blake3_compress_xof_avx512 PROC +_blake3_compress_xof_avx512 PROC + 
sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+10H], xmm7 + vmovdqa xmmword ptr [rsp+20H], xmm8 + vmovdqa xmmword ptr [rsp+30H], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + movzx eax, byte ptr [rsp+70H] + movzx r8d, r8b + mov r10, qword ptr [rsp+78H] + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+10H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+20H] + vmovups xmm9, xmmword ptr [rdx+30H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +@@: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz @F + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + 
vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp @B +@@: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, xmmword ptr [rcx] + vpxor xmm3, xmm3, xmmword ptr [rcx+10H] + vmovdqu xmmword ptr [r10], xmm0 + vmovdqu xmmword ptr [r10+10H], xmm1 + vmovdqu xmmword ptr [r10+20H], xmm2 + vmovdqu xmmword ptr [r10+30H], xmm3 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+10H] + vmovdqa xmm8, xmmword ptr [rsp+20H] + vmovdqa xmm9, xmmword ptr [rsp+30H] + add rsp, 72 + ret +_blake3_compress_xof_avx512 ENDP +blake3_compress_xof_avx512 ENDP + +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +INDEX0: + dd 0, 1, 2, 3, 16, 17, 18, 19 + dd 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + dd 4, 5, 6, 7, 20, 21, 22, 23 + dd 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + dd 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: + dd 1 +ADD16: + dd 16 +BLAKE3_BLOCK_LEN: + dd 64 +ALIGN 64 +BLAKE3_IV: +BLAKE3_IV_0: + dd 06A09E667H +BLAKE3_IV_1: + dd 0BB67AE85H +BLAKE3_IV_2: + dd 03C6EF372H +BLAKE3_IV_3: + dd 0A54FF53AH + +_RDATA ENDS +END diff --git a/3rdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml new file mode 100644 index 000000000..2052c7458 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/Cargo.toml @@ -0,0 +1,29 @@ +# These are Rust bindings for the C implementation of BLAKE3. As there is a +# native (and faster) Rust implementation of BLAKE3 provided in this same repo, +# these bindings are not expected to be used in production. They're intended +# for testing and benchmarking. + +[package] +name = "blake3_c_rust_bindings" +version = "0.0.0" +description = "TESTING ONLY Rust bindings for the BLAKE3 C implementation" +edition = "2018" + +[features] +# By default the x86-64 build uses assembly implementations. This feature makes +# the build use the C intrinsics implementations instead. +prefer_intrinsics = [] +# Activate NEON bindings. 
We don't currently do any CPU feature detection for +# this. If this Cargo feature is on, the NEON gets used. +neon = [] + +[dev-dependencies] +arrayref = "0.3.5" +arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] } +page_size = "0.4.1" +rand = "0.7.2" +rand_chacha = "0.2.1" +reference_impl = { path = "../../reference_impl" } + +[build-dependencies] +cc = "1.0.48" diff --git a/3rdparty/BLAKE3/c/blake3_c_rust_bindings/README.md b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/README.md new file mode 100644 index 000000000..c44726b90 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/README.md @@ -0,0 +1,4 @@ +These are Rust bindings for the C implementation of BLAKE3. As there is +a native Rust implementation of BLAKE3 provided in this same repo, these +bindings are not expected to be used in production. They're intended for +testing and benchmarking. diff --git a/3rdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs new file mode 100644 index 000000000..119bd2064 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/benches/bench.rs @@ -0,0 +1,393 @@ +#![feature(test)] + +extern crate test; + +use arrayref::array_ref; +use arrayvec::ArrayVec; +use rand::prelude::*; +use test::Bencher; + +const KIB: usize = 1024; +const MAX_SIMD_DEGREE: usize = 16; + +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; +const OUT_LEN: usize = 32; + +// This struct randomizes two things: +// 1. The actual bytes of input. +// 2. The page offset the input starts at. 
+pub struct RandomInput { + buf: Vec<u8>, + len: usize, + offsets: Vec<usize>, + offset_index: usize, +} + +impl RandomInput { + pub fn new(b: &mut Bencher, len: usize) -> Self { + b.bytes += len as u64; + let page_size: usize = page_size::get(); + let mut buf = vec![0u8; len + page_size]; + let mut rng = rand::thread_rng(); + rng.fill_bytes(&mut buf); + let mut offsets: Vec<usize> = (0..page_size).collect(); + offsets.shuffle(&mut rng); + Self { + buf, + len, + offsets, + offset_index: 0, + } + } + + pub fn get(&mut self) -> &[u8] { + let offset = self.offsets[self.offset_index]; + self.offset_index += 1; + if self.offset_index >= self.offsets.len() { + self.offset_index = 0; + } + &self.buf[offset..][..self.len] + } +} + +type CompressInPlaceFn = + unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8); + +fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) { + let mut state = [1u32; 8]; + let mut r = RandomInput::new(b, 64); + let input = array_ref!(r.get(), 0, 64); + b.iter(|| unsafe { f(state.as_mut_ptr(), input.as_ptr(), 64, 0, 0) }); +} + +#[bench] +fn bench_single_compression_portable(b: &mut Bencher) { + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::blake3_compress_in_place_portable, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse2, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse41, + ); +} + +#[bench] +fn bench_single_compression_avx512(b: &mut Bencher) { + if 
!blake3_c_rust_bindings::avx512_detected() { + return; + } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_avx512, + ); +} + +type HashManyFn = unsafe extern "C" fn( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, +); + +fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn, degree: usize) { + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, CHUNK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + unsafe { + f( + input_arrays.as_ptr() as _, + input_arrays.len(), + CHUNK_LEN / BLOCK_LEN, + [0u32; 8].as_ptr(), + 0, + true, + 0, + 0, + 0, + out.as_mut_ptr(), + ) + } + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_avx2(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx2_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2, + 8, + ); +} + +#[bench] +fn bench_many_chunks_avx512(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx512_detected() { + return; + } + bench_many_chunks_fn( + b, + 
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512, + 16, + ); +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_chunks_neon(b: &mut Bencher) { + // When "neon" is on, NEON support is assumed. + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon, + 4, + ); +} + +// TODO: When we get const generics we can unify this with the chunks code. +fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn, degree: usize) { + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, BLOCK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + unsafe { + f( + input_arrays.as_ptr() as _, + input_arrays.len(), + 1, + [0u32; 8].as_ptr(), + 0, + false, + 0, + 0, + 0, + out.as_mut_ptr(), + ) + } + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_avx2(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx2_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2, + 8, + ); +} + +#[bench] +fn bench_many_parents_avx512(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx512_detected() { + return; + } + bench_many_parents_fn( + b, + 
blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512, + 16, + ); +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_parents_neon(b: &mut Bencher) { + // When "neon" is on, NEON support is assumed. + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon, + 4, + ); +} + +fn bench_incremental(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3_c_rust_bindings::Hasher::new(); + hasher.update(input.get()); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} + +#[bench] +fn bench_incremental_0001_block(b: &mut Bencher) { + bench_incremental(b, BLOCK_LEN); +} + +#[bench] +fn bench_incremental_0001_kib(b: &mut Bencher) { + bench_incremental(b, 1 * KIB); +} + +#[bench] +fn bench_incremental_0002_kib(b: &mut Bencher) { + bench_incremental(b, 2 * KIB); +} + +#[bench] +fn bench_incremental_0004_kib(b: &mut Bencher) { + bench_incremental(b, 4 * KIB); +} + +#[bench] +fn bench_incremental_0008_kib(b: &mut Bencher) { + bench_incremental(b, 8 * KIB); +} + +#[bench] +fn bench_incremental_0016_kib(b: &mut Bencher) { + bench_incremental(b, 16 * KIB); +} + +#[bench] +fn bench_incremental_0032_kib(b: &mut Bencher) { + bench_incremental(b, 32 * KIB); +} + +#[bench] +fn bench_incremental_0064_kib(b: &mut Bencher) { + bench_incremental(b, 64 * KIB); +} + +#[bench] +fn bench_incremental_0128_kib(b: &mut Bencher) { + bench_incremental(b, 128 * KIB); +} + +#[bench] +fn bench_incremental_0256_kib(b: &mut Bencher) { + bench_incremental(b, 256 * KIB); +} + +#[bench] +fn bench_incremental_0512_kib(b: &mut Bencher) { + bench_incremental(b, 512 * KIB); +} + +#[bench] +fn bench_incremental_1024_kib(b: &mut Bencher) { + bench_incremental(b, 1024 * KIB); +} + +// This checks that update() splits up its input in increasing powers of 2, so +// that it can recover a high degree of parallelism when the number of bytes +// hashed so far is uneven. 
The performance of this benchmark should be +// reasonably close to bench_incremental_0064_kib, within 80% or so. When we +// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), +// performance was less than half. +#[bench] +fn bench_two_updates(b: &mut Bencher) { + let len = 65536; + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3_c_rust_bindings::Hasher::new(); + let input = input.get(); + hasher.update(&input[..1]); + hasher.update(&input[1..]); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} diff --git a/3rdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs new file mode 100644 index 000000000..d5dc47a81 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/build.rs @@ -0,0 +1,182 @@ +use std::env; + +// Reports whether the environment variable `var` is set, whatever its value. +fn defined(var: &str) -> bool { + env::var_os(var).is_some() +} + +// Splits the `TARGET` environment variable on '-'. Panics if `TARGET` is unset or not valid Unicode. +fn target_components() -> Vec<String> { + let target = env::var("TARGET").unwrap(); + target.split("-").map(|s| s.to_string()).collect() +} + +fn is_x86_64() -> bool { + target_components()[0] == "x86_64" +} + +fn is_x86_32() -> bool { + let arch = &target_components()[0]; + arch == "i386" || arch == "i586" || arch == "i686" +} + +fn is_armv7() -> bool { + target_components()[0] == "armv7" +} + +// Windows targets may be using the MSVC toolchain or the GNU toolchain. The +// right compiler flags to use depend on the toolchain. (And we don't want to +// use flag_if_supported, because we don't want features to be silently +// disabled by old compilers.) +fn is_windows_msvc() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "msvc" +} + +fn is_windows_gnu() -> bool { + // Some targets are only two components long, so check in steps. 
+ target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "gnu" +} + +// Creates a fresh `cc::Build`, passing -std=c11 unless the target is Windows MSVC. +fn new_build() -> cc::Build { + let mut build = cc::Build::new(); + if !is_windows_msvc() { + build.flag("-std=c11"); + } + build +} + +fn c_dir_path(filename: &str) -> String { + // The `cross` tool doesn't support reading files in parent directories. As a hacky workaround + // in `cross_test.sh`, we move the c/ directory around and set BLAKE3_C_DIR_OVERRIDE. Regular + // building and testing doesn't require this. + if let Ok(c_dir_override) = env::var("BLAKE3_C_DIR_OVERRIDE") { + c_dir_override + "/" + filename + } else { + "../".to_string() + filename + } +} + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let mut base_build = new_build(); + base_build.file(c_dir_path("blake3.c")); + base_build.file(c_dir_path("blake3_dispatch.c")); + base_build.file(c_dir_path("blake3_portable.c")); + base_build.compile("blake3_base"); + + if is_x86_64() && !defined("CARGO_FEATURE_PREFER_INTRINSICS") { + // On 64-bit, use the assembly implementations, unless the + // "prefer_intrinsics" feature is enabled. + if is_windows_msvc() { + let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_sse41_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_avx2_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_avx512_x86-64_windows_msvc.asm")); + build.compile("blake3_asm"); + } else if is_windows_gnu() { + let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_sse41_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_avx2_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_avx512_x86-64_windows_gnu.S")); + build.compile("blake3_asm"); + } else { + // All non-Windows implementations are assumed to support + // Linux-style assembly. These files do contain a small + // explicit workaround for macOS also. 
+ let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_unix.S")); + build.file(c_dir_path("blake3_sse41_x86-64_unix.S")); + build.file(c_dir_path("blake3_avx2_x86-64_unix.S")); + build.file(c_dir_path("blake3_avx512_x86-64_unix.S")); + build.compile("blake3_asm"); + } + } else if is_x86_64() || is_x86_32() { + // Assembly implementations are only for 64-bit. On 32-bit, or if + // the "prefer_intrinsics" feature is enabled, use the + // intrinsics-based C implementations. These each need to be + // compiled separately, with the corresponding instruction set + // extension explicitly enabled in the compiler. + + let mut sse2_build = new_build(); + sse2_build.file(c_dir_path("blake3_sse2.c")); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrinsics: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse2_build.flag("-msse2"); + } + sse2_build.compile("blake3_sse2"); + + let mut sse41_build = new_build(); + sse41_build.file(c_dir_path("blake3_sse41.c")); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrinsics: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse41_build.flag("-msse4.1"); + } + sse41_build.compile("blake3_sse41"); + + let mut avx2_build = new_build(); + avx2_build.file(c_dir_path("blake3_avx2.c")); + if is_windows_msvc() { + avx2_build.flag("/arch:AVX2"); + } else { + avx2_build.flag("-mavx2"); + } + avx2_build.compile("blake3_avx2"); + + let mut avx512_build = new_build(); + avx512_build.file(c_dir_path("blake3_avx512.c")); + if is_windows_msvc() { + // Note that a lot of versions of MSVC don't support /arch:AVX512, + // and they'll discard it with a warning, hopefully leading to a + // build error. 
+ avx512_build.flag("/arch:AVX512"); + } else { + avx512_build.flag("-mavx512f"); + avx512_build.flag("-mavx512vl"); + } + avx512_build.compile("blake3_avx512"); + } + + // We only build NEON code here if 1) it's requested and 2) the root crate + // is not already building it. The only time this will really happen is if + // you build this crate by hand with the "neon" feature for some reason. + if defined("CARGO_FEATURE_NEON") { + let mut neon_build = new_build(); + neon_build.file(c_dir_path("blake3_neon.c")); + // ARMv7 platforms that support NEON generally need the following + // flags. AArch64 supports NEON by default and does not support -mfpu. + if is_armv7() { + neon_build.flag("-mfpu=neon-vfpv4"); + neon_build.flag("-mfloat-abi=hard"); + } + neon_build.compile("blake3_neon"); + } + + // The `cc` crate does not automatically emit rerun-if directives for the + // environment variables it supports, in particular for $CC. We expect to + // do a lot of benchmarking across different compilers, so we explicitly + // add the variables that we're likely to need. + println!("cargo:rerun-if-env-changed=CC"); + println!("cargo:rerun-if-env-changed=CFLAGS"); + + // Ditto for source files, though these shouldn't change as often. + for file in std::fs::read_dir("..")? { + println!( + "cargo:rerun-if-changed={}", + file?.path().to_str().expect("utf-8") + ); + } + + Ok(()) +} diff --git a/3rdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh new file mode 100644 index 000000000..94d50affb --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/cross_test.sh @@ -0,0 +1,31 @@ +#! /usr/bin/env bash + +# This hacky script works around the fact that `cross test` does not support +# path dependencies. (It uses a docker shared folder to let the guest access +# project files, so parent directories aren't available.) 
Solve this problem by +# copying the entire project to a temp dir and rearranging paths to put "c" and +# "reference_impl" underneath "blake3_c_rust_bindings", so that everything is +# accessible. Hopefully this will just run on CI forever and no one will ever +# read this and discover my deep shame. + +set -e -u -o pipefail + +project_root="$(realpath "$(dirname "$BASH_SOURCE")/../..")" +tmpdir="$(mktemp -d)" +echo "Running cross tests in $tmpdir" +cd "$tmpdir" +git clone "$project_root" blake3 +mv blake3/c/blake3_c_rust_bindings . +mv blake3/reference_impl blake3_c_rust_bindings +mv blake3/c blake3_c_rust_bindings +cd blake3_c_rust_bindings +sed -i 's|reference_impl = { path = "../../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml + +export BLAKE3_C_DIR_OVERRIDE="./c" +cat > Cross.toml << EOF +[build.env] +passthrough = [ + "BLAKE3_C_DIR_OVERRIDE", +] +EOF +cross test "$@" diff --git a/3rdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs new file mode 100644 index 000000000..f18fe123f --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/src/lib.rs @@ -0,0 +1,299 @@ +//! These are Rust bindings for the C implementation of BLAKE3. As there is a +//! native (and faster) Rust implementation of BLAKE3 provided in this same +//! repo, these bindings are not expected to be used in production. They're +//! intended for testing and benchmarking. + +use std::ffi::{c_void, CString}; +use std::mem::MaybeUninit; + +#[cfg(test)] +mod test; + +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; +pub const OUT_LEN: usize = 32; + +// Feature detection functions for tests and benchmarks. Note that the C code +// does its own feature detection in blake3_dispatch.c. 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn sse2_detected() -> bool { + is_x86_feature_detected!("sse2") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn sse41_detected() -> bool { + is_x86_feature_detected!("sse4.1") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn avx2_detected() -> bool { + is_x86_feature_detected!("avx2") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn avx512_detected() -> bool { + is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") +} + +#[derive(Clone)] +pub struct Hasher(ffi::blake3_hasher); + +impl Hasher { + pub fn new() -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init(c_state.as_mut_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_keyed(key: &[u8; 32]) -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init_keyed(c_state.as_mut_ptr(), key.as_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_derive_key(context: &str) -> Self { + let mut c_state = MaybeUninit::uninit(); + let context_c_string = CString::new(context).expect("valid C string, no null bytes"); + unsafe { + ffi::blake3_hasher_init_derive_key(c_state.as_mut_ptr(), context_c_string.as_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_derive_key_raw(context: &[u8]) -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init_derive_key_raw( + c_state.as_mut_ptr(), + context.as_ptr() as *const _, + context.len(), + ); + Self(c_state.assume_init()) + } + } + + pub fn update(&mut self, input: &[u8]) { + unsafe { + ffi::blake3_hasher_update(&mut self.0, input.as_ptr() as *const c_void, input.len()); + } + } + + pub fn finalize(&self, output: &mut [u8]) { + unsafe { + ffi::blake3_hasher_finalize(&self.0, output.as_mut_ptr(), output.len()); + } + } + + pub fn finalize_seek(&self, seek: u64, output: &mut [u8]) { + unsafe { + 
ffi::blake3_hasher_finalize_seek(&self.0, seek, output.as_mut_ptr(), output.len()); + } + } +} + +pub mod ffi { + #[repr(C)] + #[derive(Copy, Clone)] + pub struct blake3_chunk_state { + pub cv: [u32; 8usize], + pub chunk_counter: u64, + pub buf: [u8; 64usize], + pub buf_len: u8, + pub blocks_compressed: u8, + pub flags: u8, + } + + #[repr(C)] + #[derive(Copy, Clone)] + pub struct blake3_hasher { + pub key: [u32; 8usize], + pub chunk: blake3_chunk_state, + pub cv_stack_len: u8, + pub cv_stack: [u8; 1728usize], + } + + extern "C" { + // public interface + pub fn blake3_hasher_init(self_: *mut blake3_hasher); + pub fn blake3_hasher_init_keyed(self_: *mut blake3_hasher, key: *const u8); + pub fn blake3_hasher_init_derive_key( + self_: *mut blake3_hasher, + context: *const ::std::os::raw::c_char, + ); + pub fn blake3_hasher_init_derive_key_raw( + self_: *mut blake3_hasher, + context: *const ::std::os::raw::c_void, + context_len: usize, + ); + pub fn blake3_hasher_update( + self_: *mut blake3_hasher, + input: *const ::std::os::raw::c_void, + input_len: usize, + ); + pub fn blake3_hasher_finalize(self_: *const blake3_hasher, out: *mut u8, out_len: usize); + pub fn blake3_hasher_finalize_seek( + self_: *const blake3_hasher, + seek: u64, + out: *mut u8, + out_len: usize, + ); + + // portable low-level functions + pub fn blake3_compress_in_place_portable( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_portable( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_portable( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub mod x86 { + extern "C" { + // SSE2 low level functions + pub fn blake3_compress_in_place_sse2( 
+ cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse2( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // SSE4.1 low level functions + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // AVX2 low level functions + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // AVX-512 low level functions + pub fn blake3_compress_xof_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_compress_in_place_avx512( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_hash_many_avx512( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } + } + + #[cfg(feature = "neon")] + pub mod neon { + extern "C" { + // NEON low level functions + pub fn blake3_hash_many_neon( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: 
*const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } + } +} diff --git a/3rdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs new file mode 100644 index 000000000..b989ae9c4 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_c_rust_bindings/src/test.rs @@ -0,0 +1,511 @@ +// Most of this code is duplicated from the root `blake3` crate. Perhaps we +// could share more of it in the future. + +use crate::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use arrayref::{array_mut_ref, array_ref}; +use arrayvec::ArrayVec; +use core::usize; +use rand::prelude::*; + +const CHUNK_START: u8 = 1 << 0; +const CHUNK_END: u8 = 1 << 1; +const PARENT: u8 = 1 << 2; +const ROOT: u8 = 1 << 3; +const KEYED_HASH: u8 = 1 << 4; +// const DERIVE_KEY_CONTEXT: u8 = 1 << 5; +// const DERIVE_KEY_MATERIAL: u8 = 1 << 6; + +// Interesting input lengths to run tests on. +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; + +// There's a test to make sure these two are equal below. +pub const TEST_KEY: [u8; 32] = *b"whats the Elvish word for friend"; +pub const TEST_KEY_WORDS: [u32; 8] = [ + 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, +]; + +// Paint the input with a repeating byte pattern. 
We use a cycle length of 251, +// because that's the largest prime number less than 256. This makes it +// unlikely that swapping any two adjacent input blocks or chunks will give the +// same answer. +fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +#[inline(always)] +fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + out +} + +type CompressInPlaceFn = + unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8); + +type CompressXofFn = unsafe extern "C" fn( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, +); + +// A shared helper function for platform-specific tests. +pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { + let initial_state = TEST_KEY_WORDS; + let block_len: u8 = 61; + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len as usize]); + // Use a counter with set bits in both 32-bit words. 
+ let counter = (5u64 << 32) + 6; + let flags = CHUNK_END | ROOT | KEYED_HASH; + + let mut portable_out = [0; 64]; + unsafe { + crate::ffi::blake3_compress_xof_portable( + initial_state.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + portable_out.as_mut_ptr(), + ); + } + + let mut test_state = initial_state; + unsafe { + compress_in_place_fn( + test_state.as_mut_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + ) + }; + let test_state_bytes = le_bytes_from_words_32(&test_state); + let mut test_xof = [0; 64]; + unsafe { + compress_xof_fn( + initial_state.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + test_xof.as_mut_ptr(), + ) + }; + + assert_eq!(&portable_out[..32], &test_state_bytes[..]); + assert_eq!(&portable_out[..], &test_xof[..]); +} + +// Testing the portable implementation against itself is circular, but why not. +#[test] +fn test_compress_portable() { + test_compress_fn( + crate::ffi::blake3_compress_in_place_portable, + crate::ffi::blake3_compress_xof_portable, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_sse2() { + if !crate::sse2_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_sse2, + crate::ffi::x86::blake3_compress_xof_sse2, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_sse41() { + if !crate::sse41_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_sse41, + crate::ffi::x86::blake3_compress_xof_sse41, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_avx512() { + if !crate::avx512_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_avx512, + crate::ffi::x86::blake3_compress_xof_avx512, + ); +} + +type HashManyFn = unsafe extern "C" fn( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + 
increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, +); + +// A shared helper function for platform-specific tests. +pub fn test_hash_many_fn(hash_many_fn: HashManyFn) { + // 31 (16 + 8 + 4 + 2 + 1) inputs + const NUM_INPUTS: usize = 31; + let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; + crate::test::paint_test_input(&mut input_buf); + // A counter just prior to u32::MAX. + let counter = (1u64 << 32) - 1; + + // First hash chunks. + let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new(); + for i in 0..NUM_INPUTS { + chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); + } + let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + crate::ffi::blake3_hash_many_portable( + chunks.as_ptr() as _, + chunks.len(), + CHUNK_LEN / BLOCK_LEN, + TEST_KEY_WORDS.as_ptr(), + counter, + true, + KEYED_HASH, + CHUNK_START, + CHUNK_END, + portable_chunks_out.as_mut_ptr(), + ); + } + + let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_fn( + chunks.as_ptr() as _, + chunks.len(), + CHUNK_LEN / BLOCK_LEN, + TEST_KEY_WORDS.as_ptr(), + counter, + true, + KEYED_HASH, + CHUNK_START, + CHUNK_END, + test_chunks_out.as_mut_ptr(), + ); + } + for n in 0..NUM_INPUTS { + dbg!(n); + assert_eq!( + &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], + &test_chunks_out[n * OUT_LEN..][..OUT_LEN] + ); + } + + // Then hash parents. 
+ let mut parents = ArrayVec::<[&[u8; 2 * OUT_LEN]; NUM_INPUTS]>::new(); + for i in 0..NUM_INPUTS { + parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); + } + let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + crate::ffi::blake3_hash_many_portable( + parents.as_ptr() as _, + parents.len(), + 1, + TEST_KEY_WORDS.as_ptr(), + counter, + false, + KEYED_HASH | PARENT, + 0, + 0, + portable_parents_out.as_mut_ptr(), + ); + } + + let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_fn( + parents.as_ptr() as _, + parents.len(), + 1, + TEST_KEY_WORDS.as_ptr(), + counter, + false, + KEYED_HASH | PARENT, + 0, + 0, + test_parents_out.as_mut_ptr(), + ); + } + for n in 0..NUM_INPUTS { + dbg!(n); + assert_eq!( + &portable_parents_out[n * OUT_LEN..][..OUT_LEN], + &test_parents_out[n * OUT_LEN..][..OUT_LEN] + ); + } +} + +// Testing the portable implementation against itself is circular, but why not. +#[test] +fn test_hash_many_portable() { + test_hash_many_fn(crate::ffi::blake3_hash_many_portable); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_sse2() { + if !crate::sse2_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse2); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_sse41() { + if !crate::sse41_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse41); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_avx2() { + if !crate::avx2_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx2); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_avx512() { + if !crate::avx512_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx512); +} + +#[test] +#[cfg(feature = "neon")] +fn test_hash_many_neon() { + 
test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon); +} + +#[test] +fn test_compare_reference_impl() { + const OUT: usize = 303; // more than 64, not a multiple of 4 + let mut input_buf = [0; TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + for &case in TEST_CASES { + let input = &input_buf[..case]; + dbg!(case); + + // regular + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + } + + // keyed + { + let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + } + + // derive_key + { + let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // the regular C string API + let mut test_hasher = crate::Hasher::new_derive_key(context); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + + // the raw bytes API + let mut test_hasher_raw = crate::Hasher::new_derive_key_raw(context.as_bytes()); + test_hasher_raw.update(input); + let mut test_out_raw = [0; OUT]; + test_hasher_raw.finalize(&mut test_out_raw); + assert_eq!(test_out_raw[..], expected_out[..]); + } + } +} + +fn reference_hash(input: 
&[u8]) -> [u8; OUT_LEN] { + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input); + let mut bytes = [0; OUT_LEN]; + hasher.finalize(&mut bytes); + bytes.into() +} + +#[test] +fn test_compare_update_multiple() { + // Don't use all the long test cases here, since that's unnecessarily slow + // in debug mode. + let mut short_test_cases = TEST_CASES; + while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { + short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; + } + assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); + + let mut input_buf = [0; 2 * TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + + for &first_update in short_test_cases { + dbg!(first_update); + let first_input = &input_buf[..first_update]; + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(first_input); + + for &second_update in short_test_cases { + dbg!(second_update); + let second_input = &input_buf[first_update..][..second_update]; + let total_input = &input_buf[..first_update + second_update]; + + // Clone the hasher with first_update bytes already written, so + // that the next iteration can reuse it. + let mut test_hasher = test_hasher.clone(); + test_hasher.update(second_input); + let mut test_out = [0; OUT_LEN]; + test_hasher.finalize(&mut test_out); + + let expected = reference_hash(total_input); + assert_eq!(expected, test_out); + } + } +} + +#[test] +fn test_fuzz_hasher() { + const INPUT_MAX: usize = 4 * CHUNK_LEN; + let mut input_buf = [0; 3 * INPUT_MAX]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; + + // Use a fixed RNG seed for reproducibility. 
+ let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); + for _num_test in 0..num_tests { + dbg!(_num_test); + let mut hasher = crate::Hasher::new(); + let mut total_input = 0; + // For each test, write 3 inputs of random length. + for _ in 0..3 { + let input_len = rng.gen_range(0, INPUT_MAX + 1); + dbg!(input_len); + let input = &input_buf[total_input..][..input_len]; + hasher.update(input); + total_input += input_len; + } + let expected = reference_hash(&input_buf[..total_input]); + let mut test_out = [0; 32]; + hasher.finalize(&mut test_out); + assert_eq!(expected, test_out); + } +} + +#[test] +fn test_finalize_seek() { + let mut expected = [0; 1000]; + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(b"foobarbaz"); + reference_hasher.finalize(&mut expected); + } + + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(b"foobarbaz"); + + let mut out = [0; 103]; + for &seek in &[0, 1, 7, 59, 63, 64, 65, 501, expected.len() - out.len()] { + dbg!(seek); + test_hasher.finalize_seek(seek as u64, &mut out); + assert_eq!(&expected[seek..][..out.len()], &out[..]); + } +} diff --git a/3rdparty/BLAKE3/c/blake3_dispatch.c b/3rdparty/BLAKE3/c/blake3_dispatch.c new file mode 100644 index 000000000..6518478e5 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_dispatch.c @@ -0,0 +1,276 @@ +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include "blake3_impl.h" + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include <intrin.h> +#elif defined(__GNUC__) +#include <immintrin.h> +#else +#error "Unimplemented!" 
+#endif +#endif + +#define MAYBE_UNUSED(x) (void)((x)) + +#if defined(IS_X86) +static uint64_t xgetbv() { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax = 0, edx = 0; + __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} + +static void cpuid(uint32_t out[4], uint32_t id) { +#if defined(_MSC_VER) + __cpuid((int *)out, id); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#endif +} + +static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { +#if defined(_MSC_VER) + __cpuidex((int *)out, id, sid); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#endif +} + +#endif + +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... 
*/ + UNDEFINED = 1 << 30 +}; + +#if !defined(BLAKE3_TESTING) +static /* Allow the variable to be controlled manually for testing */ +#endif + enum cpu_feature g_cpu_features = UNDEFINED; + +#if !defined(BLAKE3_TESTING) +static +#endif + enum cpu_feature + get_cpu_features() { + + if (g_cpu_features != UNDEFINED) { + return g_cpu_features; + } else { +#if defined(IS_X86) + uint32_t regs[4] = {0}; + uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3]; + (void)edx; + enum cpu_feature features = 0; + cpuid(regs, 0); + const int max_id = *eax; + cpuid(regs, 1); +#if defined(__amd64__) || defined(_M_X64) + features |= SSE2; +#else + if (*edx & (1UL << 26)) + features |= SSE2; +#endif + if (*ecx & (1UL << 0)) + features |= SSSE3; + if (*ecx & (1UL << 19)) + features |= SSE41; + + if (*ecx & (1UL << 27)) { // OSXSAVE + const uint64_t mask = xgetbv(); + if ((mask & 6) == 6) { // SSE and AVX states + if (*ecx & (1UL << 28)) + features |= AVX; + if (max_id >= 7) { + cpuidex(regs, 7, 0); + if (*ebx & (1UL << 5)) + features |= AVX2; + if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm + if (*ebx & (1UL << 31)) + features |= AVX512VL; + if (*ebx & (1UL << 16)) + features |= AVX512F; + } + } + } + } + g_cpu_features = features; + return features; +#else + /* How to detect NEON? 
*/ + return 0; +#endif + } +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); + return; + } +#endif +#endif + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); + return; + } +#endif +#endif + blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); +} + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + 
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#endif + +#if defined(BLAKE3_USE_NEON) + blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + return; +#endif + + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +} + +// The dynamically detected SIMD degree of the current platform. 
+size_t blake3_simd_degree(void) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + return 16; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + return 8; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + return 4; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + return 4; + } +#endif +#endif +#if defined(BLAKE3_USE_NEON) + return 4; +#endif + return 1; +} diff --git a/3rdparty/BLAKE3/c/blake3_impl.h b/3rdparty/BLAKE3/c/blake3_impl.h new file mode 100644 index 000000000..86ab6aa25 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_impl.h @@ -0,0 +1,269 @@ +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "blake3.h" + +// internal flags +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +// This C implementation tries to support recent versions of GCC, Clang, and +// MSVC. +#if defined(_MSC_VER) +#define INLINE static __forceinline +#else +#define INLINE static inline __attribute__((always_inline)) +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define IS_X86 +#define IS_X86_64 +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define IS_X86 +#define IS_X86_32 +#endif + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include <intrin.h> +#endif +#include <immintrin.h> +#endif + +#if defined(IS_X86) +#define MAX_SIMD_DEGREE 16 +#elif defined(BLAKE3_USE_NEON) +#define MAX_SIMD_DEGREE 4 +#else +#define MAX_SIMD_DEGREE 1 +#endif + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? 
MAX_SIMD_DEGREE : 2) + +static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, + 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, + 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +/* x is assumed to be nonzero. */ +static unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return 63 ^ __builtin_clzll(x); +#elif defined(_MSC_VER) && defined(IS_X86_64) + unsigned long index; + _BitScanReverse64(&index, x); + return index; +#elif defined(_MSC_VER) && defined(IS_X86_32) + if(x >> 32) { + unsigned long index; + _BitScanReverse(&index, x >> 32); + return 32 + index; + } else { + unsigned long index; + _BitScanReverse(&index, x); + return index; + } +#else + unsigned int c = 0; + if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if(x & 0x0000000000000002ULL) { c += 1; } + return c; +#endif +} + +// Count the number of 1 bits. +INLINE unsigned int popcnt(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_popcountll(x); +#else + unsigned int count = 0; + while (x != 0) { + count += 1; + x &= x - 1; + } + return count; +#endif +} + +// Largest power of two less than or equal to x. As a special case, returns 1 +// when x is 0. 
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) { + return 1ULL << highest_one(x | 1); +} + +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } + +INLINE uint32_t counter_high(uint64_t counter) { + return (uint32_t)(counter >> 32); +} + +INLINE uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +INLINE void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, 
uint8_t flags_end, uint8_t *out); + +size_t blake3_simd_degree(void); + + +// Declarations for implementation-specific functions. +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +#if defined(IS_X86) +#if !defined(BLAKE3_NO_SSE2) +void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_SSE41) +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX2) +void blake3_hash_many_avx2(const uint8_t *const *inputs, 
size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX512) +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#endif + +#if defined(BLAKE3_USE_NEON) +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + + +#endif /* BLAKE3_IMPL_H */ diff --git a/3rdparty/BLAKE3/c/blake3_neon.c b/3rdparty/BLAKE3/c/blake3_neon.c new file mode 100644 index 000000000..46691f526 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_neon.c @@ -0,0 +1,346 @@ +#include "blake3_impl.h" + +#include <arm_neon.h> + +// TODO: This is probably incorrect for big-endian ARM. How should that work? +INLINE uint32x4_t loadu_128(const uint8_t src[16]) { + // vld1q_u32 has alignment requirements. Don't use it. + uint32x4_t x; + memcpy(&x, src, 16); + return x; +} + +INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { + // vst1q_u32 has alignment requirements. Don't use it. 
+ memcpy(dest, &src, 16); +} + +INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { + return vaddq_u32(a, b); +} + +INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { + return veorq_u32(a, b); +} + +INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } + +INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + uint32_t array[4] = {a, b, c, d}; + return vld1q_u32(array); +} + +INLINE uint32x4_t rot16_128(uint32x4_t x) { + return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); +} + +INLINE uint32x4_t rot12_128(uint32x4_t x) { + return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); +} + +INLINE uint32x4_t rot8_128(uint32x4_t x) { + return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); +} + +INLINE uint32x4_t rot7_128(uint32x4_t x) { + return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); +} + +// TODO: compress_neon + +// TODO: hash2_neon + +/* + * ---------------------------------------------------------------------------- + * hash4_neon + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = 
xor_128(v[7], v[11]); + v[4] = rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = 
add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(uint32x4_t vecs[4]) { + // Individually transpose the four 2x2 sub-matrices in each corner. + uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]); + uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]); + + // Swap the top-right and bottom-left 2x2s (which just got transposed). 
+ vecs[0] = + vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0])); + vecs[1] = + vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1])); + vecs[2] = + vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0])); + vecs[3] = + vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1])); +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, uint32x4_t out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]); + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + uint32x4_t *out_low, uint32x4_t *out_high) { + uint64_t mask = (increment_counter ? 
~0 : 0); + *out_low = set4( + counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); + *out_high = set4( + counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); +} + +void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + uint32x4_t h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + uint32x4_t counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + uint32x4_t block_flags_vec = set1_128(block_flags); + uint32x4_t msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + uint32x4_t v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + 
block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_neon + * ---------------------------------------------------------------------------- + */ + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +INLINE void hash_one_neon(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, + uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + // TODO: Implement compress_neon. However note that according to + // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, + // compress_neon might not be any faster than compress_portable. 
+ blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 4) { + blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/3rdparty/BLAKE3/c/blake3_portable.c b/3rdparty/BLAKE3/c/blake3_portable.c new file mode 100644 index 000000000..062dd1b47 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_portable.c @@ -0,0 +1,160 @@ +#include "blake3_impl.h" +#include <string.h> + +INLINE uint32_t rotr32(uint32_t w, uint32_t c) { + return (w >> c) | (w << (32 - c)); +} + +INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) { + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { + // Select the message schedule based on the round. + const uint8_t *schedule = MSG_SCHEDULE[round]; + + // Mix the columns. 
+ g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the rows. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, 
&block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +INLINE void hash_one_portable(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + 
block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs > 0) { + hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/3rdparty/BLAKE3/c/blake3_sse2.c b/3rdparty/BLAKE3/c/blake3_sse2.c new file mode 100644 index 000000000..159296688 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_sse2.c @@ -0,0 +1,565 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 4 + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE void storeu(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. 
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16(__m128i x) { + return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); +} + +INLINE __m128i rot12(__m128i x) { + return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); +} + +INLINE __m128i rot8(__m128i x) { + return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); +} + +INLINE __m128i rot7(__m128i x) { + return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); +} + +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot16(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot12(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot8(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot7(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. 
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) { + const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + __m128i mask = _mm_set1_epi16(imm8); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 
0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], 
&rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void 
blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} + +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] 
= addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); 
+ v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m128i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = 
loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); + *out_lo = l; + *out_hi = h; +} + +void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, 
counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + 
blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/3rdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S b/3rdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S new file mode 100644 index 000000000..d144046ab --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_sse2_x86-64_unix.S @@ -0,0 +1,2291 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + 
movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, 
xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + 
pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor 
xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, 
xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + 
movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 
25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + 
pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + 
pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + 
psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, 
xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + 
paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por 
xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + 
pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, 
xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw 
xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 
+ pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq 
xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps 
xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movd xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa 
xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + 
punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + 
shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor 
xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa 
xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +blake3_compress_xof_sse2: +_blake3_compress_xof_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + 
pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 
0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/3rdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S b/3rdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S new file mode 100644 index 000000000..494c0c6fd --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_sse2_x86-64_windows_gnu.S @@ -0,0 +1,2332 @@ +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +.section .text + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0xFFFFFFFFFFFFFFC0 + movdqa xmmword ptr [rsp+0x170], xmm6 + movdqa xmmword ptr [rsp+0x180], xmm7 + movdqa xmmword ptr [rsp+0x190], xmm8 + movdqa xmmword ptr [rsp+0x1A0], xmm9 + movdqa xmmword ptr [rsp+0x1B0], xmm10 + movdqa xmmword ptr [rsp+0x1C0], xmm11 + movdqa xmmword ptr [rsp+0x1D0], xmm12 + movdqa xmmword ptr [rsp+0x1E0], xmm13 + movdqa xmmword ptr [rsp+0x1F0], xmm14 + movdqa xmmword ptr [rsp+0x200], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x90] + mov r15, rdx 
+ shl r15, 6 + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, 
xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + 
pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw 
xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd 
xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, 
xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por 
xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + 
movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 
+ movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld 
xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 
+ psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd 
xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 
+ paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 
20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + 
movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr 
[rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, 
xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, 
xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jne 3f +4: + movdqa xmm6, xmmword ptr [rsp+0x170] + movdqa xmm7, xmmword ptr [rsp+0x180] + movdqa xmm8, xmmword ptr [rsp+0x190] + movdqa xmm9, xmmword ptr [rsp+0x1A0] + movdqa xmm10, xmmword ptr [rsp+0x1B0] + movdqa xmm11, xmmword ptr [rsp+0x1C0] + movdqa xmm12, xmmword ptr [rsp+0x1D0] + movdqa xmm13, xmmword ptr [rsp+0x1E0] + movdqa xmm14, xmmword ptr [rsp+0x1F0] + movdqa xmm15, xmmword ptr [rsp+0x200] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd 
xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movd xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps 
xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand 
xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d 
+ xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + 
punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: /* Win64 entry: rcx -> 32-byte state (read, then overwritten); rdx -> 64-byte input block; r9, r8b and the stack byte at [rsp+0xA0] form the fourth state row; 7 passes over the block */ +_blake3_compress_in_place_sse2: + sub rsp, 120 /* seven 16-byte XMM save slots */ + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm10 /* xmm10 is callee-saved on Win64 and is used as scratch by the permutation below; xmm15 is never written here, so it needs no slot */ + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd 
xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+0x10], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm10, xmmword ptr [rsp+0x60] /* restore callee-saved xmm10 from the slot filled in the prologue */ + add rsp, 120 + ret + + +.p2align 6 +_blake3_compress_xof_sse2: /* same computation as blake3_compress_in_place_sse2, but writes 64 bytes to the pointer loaded from [rsp+0xA8] and leaves the state at rcx unmodified */ +blake3_compress_xof_sse2: + sub rsp, 120 /* seven 16-byte XMM save slots */ + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm10 /* xmm10 is callee-saved on Win64 and is used as scratch by the permutation below; xmm15 is never written here, so it needs no slot */ + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + 
movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + mov r10, qword ptr [rsp+0xA8] + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr 
[PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+0x10], xmm1 + movups xmmword ptr [r10+0x20], xmm2 + movups xmmword ptr [r10+0x30], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm10, xmmword ptr [rsp+0x60] /* restore callee-saved xmm10 from the slot filled in the prologue */ + add rsp, 120 + ret + + +.section .rodata +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/3rdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm b/3rdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm new file mode 100644 index 000000000..72deb7bbc --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_sse2_x86-64_windows_msvc.asm @@ -0,0 +1,2350 @@ +public _blake3_hash_many_sse2 +public blake3_hash_many_sse2 +public 
blake3_compress_in_place_sse2 +public _blake3_compress_in_place_sse2 +public blake3_compress_xof_sse2 +public _blake3_compress_xof_sse2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse2 PROC +_blake3_hash_many_sse2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr 
[rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr 
[r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + 
por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld 
xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + 
movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + 
movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + 
pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + 
pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + 
paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld 
xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + 
psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, 
xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + 
paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, 
xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, 
xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + 
paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, 
xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 
+ movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + movd xmm13, dword ptr [rsp+124H] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 
+ pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + shl rax, 20H + or rax, 40H + movd xmm3, rax + movdqa xmmword ptr [rsp+20H], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + punpcklqdq xmm3, xmmword ptr [rsp+20H] + punpcklqdq xmm11, xmmword ptr [rsp+20H] + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 
+ paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm12, xmm13 + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm5, xmmword 
ptr [PBLENDW_0xCC_MASK] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+30H], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+30H] + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + mov eax, dword ptr [rsp+130H] + neg eax + mov r10d, dword ptr [rsp+110H+8*rax] + mov r11d, dword ptr [rsp+120H+8*rax] + mov dword ptr [rsp+110H], r10d + mov dword ptr [rsp+120H], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 
+ pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm10 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse2 ENDP +blake3_hash_many_sse2 ENDP + +blake3_compress_in_place_sse2 PROC +_blake3_compress_in_place_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword 
ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, 
xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + ; Scratch must be xmm15, not xmm10: xmm10 is nonvolatile in the Windows x64 ABI and this function never saves it, while xmm15 is saved in the prologue and restored in the epilogue. + movdqa xmm15, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm15, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm15 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse2 ENDP +blake3_compress_in_place_sse2 ENDP + +ALIGN 16 +blake3_compress_xof_sse2 PROC +_blake3_compress_xof_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + 
pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + ; Scratch must be xmm15, not xmm10: xmm10 is nonvolatile in the Windows x64 ABI and this function never saves it, while xmm15 is saved in the prologue and restored in the epilogue. + movdqa xmm15, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm15, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm15 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + 
movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_xof_sse2 ENDP +blake3_compress_xof_sse2 ENDP + +_TEXT ENDS + + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +ADD0: + dd 0, 1, 2, 3 + +ADD1: + dd 4 dup (4) + +BLAKE3_IV_0: + dd 4 dup (6A09E667H) + +BLAKE3_IV_1: + dd 4 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 4 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 4 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 4 dup (64) + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +PBLENDW_0x33_MASK: + dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H +PBLENDW_0xCC_MASK: + dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH +PBLENDW_0x3F_MASK: + dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H +PBLENDW_0xC0_MASK: + dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH + +_RDATA ENDS +END diff --git a/3rdparty/BLAKE3/c/blake3_sse41.c b/3rdparty/BLAKE3/c/blake3_sse41.c new file mode 100644 index 000000000..b31122533 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_sse41.c @@ -0,0 +1,559 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 4 + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE void storeu(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. 
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + /* set1: returns a vector with x copied into all four 32-bit lanes. */ +INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + /* set4: returns a vector whose 32-bit lanes 0..3 are a, b, c, d, in that order (note the "setr" reversed-argument intrinsic). */ +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + /* rot16: rotates every 32-bit lane by 16 bits using a byte shuffle; within each 4-byte group the result bytes 0..3 come from source bytes 2, 3, 0, 1. */ +INLINE __m128i rot16(__m128i x) { + return _mm_shuffle_epi8( + x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +} + /* rot12: rotates every 32-bit lane right by 12 bits. The two shifted parts share no set bit positions, so XOR combines them exactly as OR would. */ +INLINE __m128i rot12(__m128i x) { + return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); +} + /* rot8: rotates every 32-bit lane right by 8 bits using a byte shuffle; within each 4-byte group the result bytes 0..3 come from source bytes 1, 2, 3, 0. */ +INLINE __m128i rot8(__m128i x) { + return _mm_shuffle_epi8( + x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +} + /* rot7: rotates every 32-bit lane right by 7 bits, built from two shifts like rot12. */ +INLINE __m128i rot7(__m128i x) { + return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); +} + /* g1: updates the four row vectors in place: row0 += m + row1; row3 = rot16(row3 ^ row0); row2 += row3; row1 = rot12(row1 ^ row2). Presumably the first half of the BLAKE3 G mixing function applied to four lanes at once - confirm against the spec. */ +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot16(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot12(*row1); +} + /* g2: same update sequence as g1 but with rotations of 8 (row3) and 7 (row1); presumably the second half of the G mixing function - confirm against the spec. */ +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot8(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot7(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. 
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + 
undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} + +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + 
v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], 
v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m128i vecs[DEGREE]) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = 
loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); + *out_lo = l; + *out_hi = h; +} + +void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + 
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, 
uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/3rdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S b/3rdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S new file mode 100644 index 000000000..a3ff64269 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_sse41_x86-64_unix.S @@ -0,0 +1,2028 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + 
movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], 
xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + 
pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa 
xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, 
xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por 
xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + 
pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, 
xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr 
[rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + 
pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + 
pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + 
paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa 
xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld 
xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, 
xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, 
xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] 
+ or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + 
movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + 
punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + 
pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: +_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd 
xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +blake3_compress_xof_sse41: +_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + 
shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 
11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/3rdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S b/3rdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S new file mode 100644 index 000000000..60d0a4042 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_sse41_x86-64_windows_gnu.S @@ -0,0 +1,2069 @@ +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +.section .text + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0xFFFFFFFFFFFFFFC0 + movdqa xmmword ptr [rsp+0x170], xmm6 + movdqa xmmword ptr [rsp+0x180], xmm7 + movdqa xmmword ptr [rsp+0x190], xmm8 + movdqa xmmword ptr [rsp+0x1A0], xmm9 + movdqa xmmword ptr [rsp+0x1B0], xmm10 + movdqa xmmword ptr [rsp+0x1C0], xmm11 + movdqa xmmword ptr [rsp+0x1D0], xmm12 + movdqa xmmword ptr [rsp+0x1E0], xmm13 + movdqa xmmword ptr [rsp+0x1F0], xmm14 + movdqa xmmword ptr [rsp+0x200], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 
+ movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x90] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, 
xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword 
ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor 
xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb 
xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr 
[rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, 
xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + 
movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + 
por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + 
paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, 
xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor 
xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb 
xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, 
xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa 
xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + 
psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jne 3f +4: + movdqa xmm6, xmmword ptr [rsp+0x170] + movdqa xmm7, xmmword ptr [rsp+0x180] + movdqa xmm8, xmmword ptr [rsp+0x190] + movdqa xmm9, xmmword ptr [rsp+0x1A0] + movdqa xmm10, xmmword ptr [rsp+0x1B0] + movdqa xmm11, xmmword ptr [rsp+0x1C0] + movdqa xmm12, xmmword ptr [rsp+0x1D0] + movdqa xmm13, xmmword ptr [rsp+0x1E0] + movdqa xmm14, xmmword ptr [rsp+0x1F0] + movdqa xmm15, xmmword ptr [rsp+0x200] + mov rsp, rbp + pop rbp + pop rbx + 
pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + 
paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 
+ pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + 
pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: +_blake3_compress_in_place_sse41: + sub rsp, 120 + movdqa xmmword ptr 
[rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, 
xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+0x10], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.p2align 6 +_blake3_compress_xof_sse41: +blake3_compress_xof_sse41: + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] + movzx r8d, r8b + mov r10, qword ptr [rsp+0xA8] + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, 
xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+0x10], xmm1 + movups xmmword ptr [r10+0x20], xmm2 + movups xmmword ptr [r10+0x30], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 + ret + + +.section .rodata +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 
+BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/3rdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm b/3rdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm new file mode 100644 index 000000000..87001e4d3 --- /dev/null +++ b/3rdparty/BLAKE3/c/blake3_sse41_x86-64_windows_msvc.asm @@ -0,0 +1,2089 @@ +public _blake3_hash_many_sse41 +public blake3_hash_many_sse41 +public blake3_compress_in_place_sse41 +public _blake3_compress_in_place_sse41 +public blake3_compress_xof_sse41 +public _blake3_compress_xof_sse41 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse41 PROC +_blake3_hash_many_sse41 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + 
pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu 
xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 
+ pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, 
xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + 
movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + 
movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, 
xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr 
[rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, 
xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + 
paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 
+ pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 
+ por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 
25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, 
xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd 
xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + 
punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + pinsrd xmm14, dword ptr [rsp+124H], 1 + pinsrd 
xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8] + pshufb xmm3, xmm13 + pshufb xmm11, 
xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0CCH + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0C0H + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw 
xmm6, xmm5, 0CCH + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0C0H + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + movdqa xmm0, xmmword ptr [rsp+130H] + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm2, xmmword ptr [rsp+120H] + movdqu xmm3, xmmword ptr [rsp+118H] + movdqu xmm4, xmmword ptr [rsp+128H] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+110H], xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb 
xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse41 ENDP +blake3_hash_many_sse41 ENDP + +blake3_compress_in_place_sse41 PROC +_blake3_compress_in_place_sse41 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx 
r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, 
xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse41 ENDP +blake3_compress_in_place_sse41 ENDP + +ALIGN 16 +blake3_compress_xof_sse41 PROC +_blake3_compress_xof_sse41 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por 
xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0CCH + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0C0H + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_xof_sse41 ENDP +blake3_compress_xof_sse41 ENDP + +_TEXT ENDS + + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +ADD0: + dd 0, 1, 2, 3 + +ADD1: + dd 4 dup (4) + +BLAKE3_IV_0: + dd 4 dup (6A09E667H) + +BLAKE3_IV_1: + dd 4 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 4 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 4 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 4 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +_RDATA ENDS +END + diff --git a/3rdparty/BLAKE3/c/example.c b/3rdparty/BLAKE3/c/example.c new file mode 100644 index 
000000000..02fe3c32b --- /dev/null +++ b/3rdparty/BLAKE3/c/example.c @@ -0,0 +1,27 @@ +#include "blake3.h" +#include <stdio.h> +#include <unistd.h> + +int main() { + // Initialize the hasher. + blake3_hasher hasher; + blake3_hasher_init(&hasher); + + // Read input bytes from stdin. + unsigned char buf[65536]; + ssize_t n; + while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) { + blake3_hasher_update(&hasher, buf, n); + } + + // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + return 0; +} diff --git a/3rdparty/BLAKE3/c/main.c b/3rdparty/BLAKE3/c/main.c new file mode 100644 index 000000000..9b8a436f3 --- /dev/null +++ b/3rdparty/BLAKE3/c/main.c @@ -0,0 +1,166 @@ +/* + * This main file is intended for testing via `make test`. It does not build in + * other settings. See README.md in this directory for examples of how to build + * C code. 
+ */ + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "blake3.h" +#include "blake3_impl.h" + +#define HASH_MODE 0 +#define KEYED_HASH_MODE 1 +#define DERIVE_KEY_MODE 2 + +static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) { + if ('0' <= c && c <= '9') { + *value = c - '0'; + *valid = true; + } else if ('a' <= c && c <= 'f') { + *value = 10 + c - 'a'; + *valid = true; + } else { + *valid = false; + } +} + +static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) { + size_t hex_len = strlen(hex_key); + if (hex_len != 64) { + fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n", + hex_len); + return 1; + } + for (size_t i = 0; i < 64; i++) { + uint8_t value; + bool valid; + hex_char_value(hex_key[i], &value, &valid); + if (!valid) { + fprintf(stderr, "Invalid hex char.\n"); + return 1; + } + if (i % 2 == 0) { + out[i / 2] = 0; + value <<= 4; + } + out[i / 2] += value; + } + return 0; +} + +/* A little repetition here */ +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... 
*/ + UNDEFINED = 1 << 30 +}; + +extern enum cpu_feature g_cpu_features; +enum cpu_feature get_cpu_features(); + +int main(int argc, char **argv) { + size_t out_len = BLAKE3_OUT_LEN; + uint8_t key[BLAKE3_KEY_LEN]; + char *context = ""; + uint8_t mode = HASH_MODE; + while (argc > 1) { + if (argc <= 2) { + fprintf(stderr, "Odd number of arguments.\n"); + return 1; + } + if (strcmp("--length", argv[1]) == 0) { + char *endptr = NULL; + errno = 0; + unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10); + if (errno != 0 || out_len > SIZE_MAX || endptr == argv[2] || + *endptr != 0) { + fprintf(stderr, "Bad length argument.\n"); + return 1; + } + out_len = (size_t)out_len_ll; + } else if (strcmp("--keyed", argv[1]) == 0) { + mode = KEYED_HASH_MODE; + int ret = parse_key(argv[2], key); + if (ret != 0) { + return ret; + } + } else if (strcmp("--derive-key", argv[1]) == 0) { + mode = DERIVE_KEY_MODE; + context = argv[2]; + } else { + fprintf(stderr, "Unknown flag.\n"); + return 1; + } + argc -= 2; + argv += 2; + } + + /* + * We're going to hash the input multiple times, so we need to buffer it all. + * This is just for test cases, so go ahead and assume that the input is less + * than 1 MiB. 
+ */ + size_t buf_capacity = 1 << 20; + uint8_t *buf = malloc(buf_capacity); + assert(buf != NULL); + size_t buf_len = 0; + while (1) { + size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin); + if (n == 0) { + break; + } + buf_len += n; + assert(buf_len < buf_capacity); + } + + const int mask = get_cpu_features(); + int feature = 0; + do { + fprintf(stderr, "Testing 0x%08X\n", feature); + g_cpu_features = feature; + blake3_hasher hasher; + switch (mode) { + case HASH_MODE: + blake3_hasher_init(&hasher); + break; + case KEYED_HASH_MODE: + blake3_hasher_init_keyed(&hasher, key); + break; + case DERIVE_KEY_MODE: + blake3_hasher_init_derive_key(&hasher, context); + break; + default: + abort(); + } + + blake3_hasher_update(&hasher, buf, buf_len); + + /* TODO: An incremental output reader API to avoid this allocation. */ + uint8_t *out = malloc(out_len); + if (out_len > 0 && out == NULL) { + fprintf(stderr, "malloc() failed.\n"); + return 1; + } + blake3_hasher_finalize(&hasher, out, out_len); + for (size_t i = 0; i < out_len; i++) { + printf("%02x", out[i]); + } + printf("\n"); + free(out); + feature = (feature - mask) & mask; + } while (feature != 0); + free(buf); + return 0; +} diff --git a/3rdparty/BLAKE3/c/test.py b/3rdparty/BLAKE3/c/test.py new file mode 100644 index 000000000..b0b192950 --- /dev/null +++ b/3rdparty/BLAKE3/c/test.py @@ -0,0 +1,97 @@ +#! /usr/bin/env python3 + +from binascii import hexlify +import json +from os import path +import subprocess + +HERE = path.dirname(__file__) +TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json") +TEST_VECTORS = json.load(open(TEST_VECTORS_PATH)) + + +def run_blake3(args, input): + output = subprocess.run([path.join(HERE, "blake3")] + args, + input=input, + stdout=subprocess.PIPE, + check=True) + return output.stdout.decode().strip() + + +# Fill the input with a repeating byte pattern. We use a cycle length of 251, +# because that's the largest prime number less than 256.
This makes it unlikely +# that swapping any two adjacent input blocks or chunks will give the same +# answer. +def make_test_input(length): + i = 0 + buf = bytearray() + while len(buf) < length: + buf.append(i) + i = (i + 1) % 251 + return buf + + +def main(): + for case in TEST_VECTORS["cases"]: + input_len = case["input_len"] + input = make_test_input(input_len) + hex_key = hexlify(TEST_VECTORS["key"].encode()) + context_string = TEST_VECTORS["context_string"] + expected_hash_xof = case["hash"] + expected_hash = expected_hash_xof[:64] + expected_keyed_hash_xof = case["keyed_hash"] + expected_keyed_hash = expected_keyed_hash_xof[:64] + expected_derive_key_xof = case["derive_key"] + expected_derive_key = expected_derive_key_xof[:64] + + # Test the default hash. + test_hash = run_blake3([], input) + for line in test_hash.splitlines(): + assert expected_hash == line, \ + "hash({}): {} != {}".format(input_len, expected_hash, line) + + # Test the extended hash. + xof_len = len(expected_hash_xof) // 2 + test_hash_xof = run_blake3(["--length", str(xof_len)], input) + for line in test_hash_xof.splitlines(): + assert expected_hash_xof == line, \ + "hash_xof({}): {} != {}".format( + input_len, expected_hash_xof, line) + + # Test the default keyed hash. + test_keyed_hash = run_blake3(["--keyed", hex_key], input) + for line in test_keyed_hash.splitlines(): + assert expected_keyed_hash == line, \ + "keyed_hash({}): {} != {}".format( + input_len, expected_keyed_hash, line) + + # Test the extended keyed hash. + xof_len = len(expected_keyed_hash_xof) // 2 + test_keyed_hash_xof = run_blake3( + ["--keyed", hex_key, "--length", + str(xof_len)], input) + for line in test_keyed_hash_xof.splitlines(): + assert expected_keyed_hash_xof == line, \ + "keyed_hash_xof({}): {} != {}".format( + input_len, expected_keyed_hash_xof, line) + + # Test the default derive key.
+ test_derive_key = run_blake3(["--derive-key", context_string], input) + for line in test_derive_key.splitlines(): + assert expected_derive_key == line, \ + "derive_key({}): {} != {}".format( + input_len, expected_derive_key, line) + + # Test the extended derive key. + xof_len = len(expected_derive_key_xof) // 2 + test_derive_key_xof = run_blake3( + ["--derive-key", context_string, "--length", + str(xof_len)], input) + for line in test_derive_key_xof.splitlines(): + assert expected_derive_key_xof == line, \ + "derive_key_xof({}): {} != {}".format( + input_len, expected_derive_key_xof, line) + + +if __name__ == "__main__": + main() diff --git a/3rdparty/BLAKE3/lib/Win64/BLAKE3.lib b/3rdparty/BLAKE3/lib/Win64/BLAKE3.lib Binary files differnew file mode 100644 index 000000000..1308d9928 --- /dev/null +++ b/3rdparty/BLAKE3/lib/Win64/BLAKE3.lib diff --git a/3rdparty/BLAKE3/media/B3.svg b/3rdparty/BLAKE3/media/B3.svg new file mode 100644 index 000000000..a50da0ce9 --- /dev/null +++ b/3rdparty/BLAKE3/media/B3.svg @@ -0,0 +1,70 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="13.356165mm" + height="7.1437497mm" + viewBox="0 0 13.356165 7.1437497" + version="1.1" + id="svg8" + sodipodi:docname="B3.svg" + inkscape:version="0.92.4 5da689c313, 2019-01-14"> + <defs + id="defs2" /> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="4" + inkscape:cx="72.73328" + inkscape:cy="-34.835127" + inkscape:document-units="mm" + 
inkscape:current-layer="layer1" + showgrid="false" + inkscape:window-width="1920" + inkscape:window-height="1016" + inkscape:window-x="0" + inkscape:window-y="27" + inkscape:window-maximized="1" /> + <metadata + id="metadata5"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title /> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-24.441005,-113.52518)"> + <g + aria-label="B3" + style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332" + id="text868"> + <path + d="m 28.176921,113.52518 q 0.635,0 1.0795,0.14817 0.455084,0.13758 0.740834,0.40216 0.296333,0.254 0.433916,0.61384 0.137584,0.35983 0.137584,0.79375 0,0.62441 -0.264584,1.00541 -0.254,0.381 -0.762,0.58209 0.508,0.21166 0.783167,0.61383 0.275167,0.39158 0.275167,1.016 0,0.43392 -0.137584,0.79375 -0.137583,0.35983 -0.433916,0.62442 -0.28575,0.254 -0.740834,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.174999 q -0.592667,0 -0.592667,-0.58208 v -5.97959 q 0,-0.58208 0.592667,-0.58208 z m -2.508249,5.78908 q 0,0.11642 0.137583,0.11642 h 2.434166 q 0.5715,0 0.836084,-0.24342 0.264583,-0.24341 0.264583,-0.68791 0,-0.92075 -1.100667,-0.92075 h -2.571749 z m 0,-2.77283 h 2.539999 q 1.100667,0 1.100667,-0.85725 0,-0.42333 -0.264583,-0.67733 -0.254,-0.254 -0.8255,-0.254 h -2.413 q -0.137583,0 -0.137583,0.127 z" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" + id="path814" /> + <path + d="m 35.38417,113.52518 q 0.635,0 1.0795,0.14817 0.455083,0.13758 0.740833,0.40216 0.296333,0.254 0.433917,0.60325 0.137583,0.34925 0.137583,0.762 0,0.635 
-0.264583,1.03717 -0.254,0.39158 -0.751417,0.60325 0.508,0.21167 0.772583,0.62442 0.264584,0.40216 0.264584,1.04775 0,0.40216 -0.137584,0.75141 -0.137583,0.34925 -0.423333,0.61384 -0.28575,0.254 -0.740833,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.608917 v -1.24883 h 3.608917 q 0.550333,0 0.814917,-0.23284 0.264583,-0.24341 0.264583,-0.67733 0,-0.85725 -1.090083,-0.85725 h -2.201334 v -1.13242 h 2.169584 q 0.550333,0 0.814916,-0.20108 0.275167,-0.21167 0.275167,-0.65617 0,-0.40216 -0.254,-0.64558 -0.254,-0.24342 -0.8255,-0.24342 h -3.566583 v -1.24883 z" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" + id="path816" /> + </g> + </g> +</svg> diff --git a/3rdparty/BLAKE3/media/BLAKE3.svg b/3rdparty/BLAKE3/media/BLAKE3.svg new file mode 100644 index 000000000..2d50c2d3b --- /dev/null +++ b/3rdparty/BLAKE3/media/BLAKE3.svg @@ -0,0 +1,85 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="43.063534mm" + height="7.2707496mm" + viewBox="0 0 43.063534 7.2707496" + version="1.1" + id="svg8" + sodipodi:docname="BLAKE3.svg" + inkscape:version="0.92.4 5da689c313, 2019-01-14"> + <defs + id="defs2" /> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="4" + inkscape:cx="72.73328" + inkscape:cy="-34.835127" + inkscape:document-units="mm" + inkscape:current-layer="layer1" + showgrid="false" + 
inkscape:window-width="1920" + inkscape:window-height="1016" + inkscape:window-x="0" + inkscape:window-y="27" + inkscape:window-maximized="1" /> + <metadata + id="metadata5"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-24.441005,-113.39818)"> + <g + aria-label="BLAKE3" + style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332" + id="text868"> + <path + d="m 28.176921,113.52518 q 0.635,0 1.0795,0.14817 0.455084,0.13758 0.740834,0.40216 0.296333,0.254 0.433916,0.61384 0.137584,0.35983 0.137584,0.79375 0,0.62441 -0.264584,1.00541 -0.254,0.381 -0.762,0.58209 0.508,0.21166 0.783167,0.61383 0.275167,0.39158 0.275167,1.016 0,0.43392 -0.137584,0.79375 -0.137583,0.35983 -0.433916,0.62442 -0.28575,0.254 -0.740834,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.174999 q -0.592667,0 -0.592667,-0.58208 v -5.97959 q 0,-0.58208 0.592667,-0.58208 z m -2.508249,5.78908 q 0,0.11642 0.137583,0.11642 h 2.434166 q 0.5715,0 0.836084,-0.24342 0.264583,-0.24341 0.264583,-0.68791 0,-0.92075 -1.100667,-0.92075 h -2.571749 z m 0,-2.77283 h 2.539999 q 1.100667,0 1.100667,-0.85725 0,-0.42333 -0.264583,-0.67733 -0.254,-0.254 -0.8255,-0.254 h -2.413 q -0.137583,0 -0.137583,0.127 z" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" + id="path814" /> + <path + d="m 33.22517,113.52518 v 4.66725 q 0,0.254 0.0635,0.48683 0.07408,0.22225 0.243417,0.39159 0.169333,0.15875 0.4445,0.254 0.28575,0.0953 0.709083,0.0953 h 2.772833 v 1.24883 h -2.846916 q -0.709084,0 -1.217084,-0.17992 
-0.497416,-0.1905 -0.814916,-0.51858 -0.3175,-0.32808 -0.465667,-0.77258 -0.137583,-0.45509 -0.137583,-0.99484 v -4.67783 z" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" + id="path816" /> + <path + d="M 39.342334,120.66893 H 37.9665 l 2.50825,-6.35 q 0.201084,-0.508 0.560917,-0.70908 0.370417,-0.21167 0.941917,-0.21167 0.560916,0 0.92075,0.21167 0.370416,0.20108 0.560916,0.70908 l 2.413,6.35 h -1.386416 l -2.169584,-5.74675 q -0.09525,-0.24342 -0.34925,-0.24342 -0.254,0 -0.359833,0.24342 z" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" + id="path818" /> + <path + d="m 48.179401,113.52518 v 3.02683 h 0.687917 q 0.455083,0 0.740833,-0.0212 0.296333,-0.0318 0.486833,-0.127 0.1905,-0.0953 0.3175,-0.26459 0.137584,-0.17991 0.28575,-0.47625 l 1.090084,-2.13783 h 1.344083 l -1.121833,2.2225 q -0.243417,0.47625 -0.518584,0.79375 -0.275166,0.3175 -0.719666,0.508 0.254,0.0635 0.4445,0.17992 0.1905,0.10583 0.34925,0.27516 0.169333,0.15875 0.3175,0.39159 0.148166,0.22225 0.306916,0.52916 l 1.153584,2.24367 h -1.397 l -1.090084,-2.11667 q -0.148166,-0.28575 -0.28575,-0.45508 -0.137583,-0.16933 -0.34925,-0.26458 -0.211666,-0.0952 -0.529166,-0.11642 -0.3175,-0.0317 -0.8255,-0.0317 h -0.687917 v 2.9845 h -1.248833 v -7.14375 z" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" + id="path820" /> + <path + d="m 60.127965,113.52518 v 1.24883 h -3.577166 q -0.5715,0 -0.8255,0.24342 -0.254,0.24342 -0.254,0.65617 0,0.84666 1.090083,0.84666 h 3.513667 v 1.13242 h -3.545417 q -1.090083,0 -1.090083,0.86783 0,0.42334 0.264583,0.66675 0.264583,0.23284 0.814917,0.23284 h 3.6195 v 1.24883 h -3.6195 q 
-0.635,0 -1.090083,-0.14817 -0.4445,-0.14816 -0.740834,-0.40216 -0.28575,-0.26459 -0.423333,-0.62442 -0.127,-0.35983 -0.127,-0.77258 0,-0.61384 0.264583,-1.016 0.264584,-0.41275 0.762,-0.62442 -1.005416,-0.41275 -1.005416,-1.60867 0,-0.42333 0.137583,-0.78316 0.137583,-0.35984 0.423333,-0.61384 0.296334,-0.26458 0.740834,-0.40216 0.455083,-0.14817 1.090083,-0.14817 z" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" + id="path822" /> + <path + d="m 65.091539,113.52518 q 0.635,0 1.0795,0.14817 0.455083,0.13758 0.740833,0.40216 0.296333,0.254 0.433917,0.60325 0.137583,0.34925 0.137583,0.762 0,0.635 -0.264583,1.03717 -0.254,0.39158 -0.751417,0.60325 0.508,0.21167 0.772583,0.62442 0.264584,0.40216 0.264584,1.04775 0,0.40216 -0.137584,0.75141 -0.137583,0.34925 -0.423333,0.61384 -0.28575,0.254 -0.740833,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.608917 v -1.24883 h 3.608917 q 0.550333,0 0.814916,-0.23284 0.264584,-0.24341 0.264584,-0.67733 0,-0.85725 -1.090084,-0.85725 h -2.201333 v -1.13242 h 2.169583 q 0.550334,0 0.814917,-0.20108 0.275167,-0.21167 0.275167,-0.65617 0,-0.40216 -0.254,-0.64558 -0.254,-0.24342 -0.8255,-0.24342 h -3.566583 v -1.24883 z" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332" + id="path824" /> + </g> + </g> +</svg> diff --git a/3rdparty/BLAKE3/media/speed.svg b/3rdparty/BLAKE3/media/speed.svg new file mode 100644 index 000000000..7bd65ca3c --- /dev/null +++ b/3rdparty/BLAKE3/media/speed.svg @@ -0,0 +1,1474 @@ +<?xml version="1.0" encoding="utf-8" standalone="no"?> +<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> +<!-- Created with matplotlib (https://matplotlib.org/) --> +<svg height="331.389812pt" version="1.1" viewBox="0 0 449.761406 
331.389812" width="449.761406pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> + <defs> + <style type="text/css"> +*{stroke-linecap:butt;stroke-linejoin:round;} + </style> + </defs> + <g id="figure_1"> + <g id="patch_1"> + <path d="M 0 331.389812 +L 449.761406 331.389812 +L 449.761406 0 +L 0 0 +z +" style="fill:#ffffff;"/> + </g> + <g id="axes_1"> + <g id="patch_2"> + <path d="M 71.443906 288.430125 +L 428.563906 288.430125 +L 428.563906 22.318125 +L 71.443906 22.318125 +z +" style="fill:#ffffff;"/> + </g> + <g id="matplotlib.axis_1"> + <g id="xtick_1"> + <g id="line2d_1"> + <defs> + <path d="M 0 0 +L 0 6 +" id="me95d5351a6" style="stroke:#262626;stroke-width:1.25;"/> + </defs> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_1"> + <!-- 0 --> + <defs> + <path d="M 31.78125 66.40625 +Q 24.171875 66.40625 20.328125 58.90625 +Q 16.5 51.421875 16.5 36.375 +Q 16.5 21.390625 20.328125 13.890625 +Q 24.171875 6.390625 31.78125 6.390625 +Q 39.453125 6.390625 43.28125 13.890625 +Q 47.125 21.390625 47.125 36.375 +Q 47.125 51.421875 43.28125 58.90625 +Q 39.453125 66.40625 31.78125 66.40625 +z +M 31.78125 74.21875 +Q 44.046875 74.21875 50.515625 64.515625 +Q 56.984375 54.828125 56.984375 36.375 +Q 56.984375 17.96875 50.515625 8.265625 +Q 44.046875 -1.421875 31.78125 -1.421875 +Q 19.53125 -1.421875 13.0625 8.265625 +Q 6.59375 17.96875 6.59375 36.375 +Q 6.59375 54.828125 13.0625 64.515625 +Q 19.53125 74.21875 31.78125 74.21875 +z +" id="DejaVuSans-48"/> + </defs> + <g style="fill:#262626;" transform="translate(67.944531 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="xtick_2"> + <g id="line2d_2"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="116.083906" xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_2"> + <!-- 1000 --> + <defs> + <path d="M 
12.40625 8.296875 +L 28.515625 8.296875 +L 28.515625 63.921875 +L 10.984375 60.40625 +L 10.984375 69.390625 +L 28.421875 72.90625 +L 38.28125 72.90625 +L 38.28125 8.296875 +L 54.390625 8.296875 +L 54.390625 0 +L 12.40625 0 +z +" id="DejaVuSans-49"/> + </defs> + <g style="fill:#262626;" transform="translate(102.086406 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-49"/> + <use x="63.623047" xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + <use x="190.869141" xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="xtick_3"> + <g id="line2d_3"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="160.723906" xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_3"> + <!-- 2000 --> + <defs> + <path d="M 19.1875 8.296875 +L 53.609375 8.296875 +L 53.609375 0 +L 7.328125 0 +L 7.328125 8.296875 +Q 12.9375 14.109375 22.625 23.890625 +Q 32.328125 33.6875 34.8125 36.53125 +Q 39.546875 41.84375 41.421875 45.53125 +Q 43.3125 49.21875 43.3125 52.78125 +Q 43.3125 58.59375 39.234375 62.25 +Q 35.15625 65.921875 28.609375 65.921875 +Q 23.96875 65.921875 18.8125 64.3125 +Q 13.671875 62.703125 7.8125 59.421875 +L 7.8125 69.390625 +Q 13.765625 71.78125 18.9375 73 +Q 24.125 74.21875 28.421875 74.21875 +Q 39.75 74.21875 46.484375 68.546875 +Q 53.21875 62.890625 53.21875 53.421875 +Q 53.21875 48.921875 51.53125 44.890625 +Q 49.859375 40.875 45.40625 35.40625 +Q 44.1875 33.984375 37.640625 27.21875 +Q 31.109375 20.453125 19.1875 8.296875 +z +" id="DejaVuSans-50"/> + </defs> + <g style="fill:#262626;" transform="translate(146.726406 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-50"/> + <use x="63.623047" xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + <use x="190.869141" xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="xtick_4"> + <g id="line2d_4"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="205.363906" 
xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_4"> + <!-- 3000 --> + <defs> + <path d="M 40.578125 39.3125 +Q 47.65625 37.796875 51.625 33 +Q 55.609375 28.21875 55.609375 21.1875 +Q 55.609375 10.40625 48.1875 4.484375 +Q 40.765625 -1.421875 27.09375 -1.421875 +Q 22.515625 -1.421875 17.65625 -0.515625 +Q 12.796875 0.390625 7.625 2.203125 +L 7.625 11.71875 +Q 11.71875 9.328125 16.59375 8.109375 +Q 21.484375 6.890625 26.8125 6.890625 +Q 36.078125 6.890625 40.9375 10.546875 +Q 45.796875 14.203125 45.796875 21.1875 +Q 45.796875 27.640625 41.28125 31.265625 +Q 36.765625 34.90625 28.71875 34.90625 +L 20.21875 34.90625 +L 20.21875 43.015625 +L 29.109375 43.015625 +Q 36.375 43.015625 40.234375 45.921875 +Q 44.09375 48.828125 44.09375 54.296875 +Q 44.09375 59.90625 40.109375 62.90625 +Q 36.140625 65.921875 28.71875 65.921875 +Q 24.65625 65.921875 20.015625 65.03125 +Q 15.375 64.15625 9.8125 62.3125 +L 9.8125 71.09375 +Q 15.4375 72.65625 20.34375 73.4375 +Q 25.25 74.21875 29.59375 74.21875 +Q 40.828125 74.21875 47.359375 69.109375 +Q 53.90625 64.015625 53.90625 55.328125 +Q 53.90625 49.265625 50.4375 45.09375 +Q 46.96875 40.921875 40.578125 39.3125 +z +" id="DejaVuSans-51"/> + </defs> + <g style="fill:#262626;" transform="translate(191.366406 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-51"/> + <use x="63.623047" xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + <use x="190.869141" xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="xtick_5"> + <g id="line2d_5"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="250.003906" xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_5"> + <!-- 4000 --> + <defs> + <path d="M 37.796875 64.3125 +L 12.890625 25.390625 +L 37.796875 25.390625 +z +M 35.203125 72.90625 +L 47.609375 72.90625 +L 47.609375 25.390625 +L 58.015625 25.390625 +L 58.015625 17.1875 +L 47.609375 17.1875 +L 47.609375 0 +L 37.796875 0 +L 37.796875 
17.1875 +L 4.890625 17.1875 +L 4.890625 26.703125 +z +" id="DejaVuSans-52"/> + </defs> + <g style="fill:#262626;" transform="translate(236.006406 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-52"/> + <use x="63.623047" xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + <use x="190.869141" xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="xtick_6"> + <g id="line2d_6"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="294.643906" xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_6"> + <!-- 5000 --> + <defs> + <path d="M 10.796875 72.90625 +L 49.515625 72.90625 +L 49.515625 64.59375 +L 19.828125 64.59375 +L 19.828125 46.734375 +Q 21.96875 47.46875 24.109375 47.828125 +Q 26.265625 48.1875 28.421875 48.1875 +Q 40.625 48.1875 47.75 41.5 +Q 54.890625 34.8125 54.890625 23.390625 +Q 54.890625 11.625 47.5625 5.09375 +Q 40.234375 -1.421875 26.90625 -1.421875 +Q 22.3125 -1.421875 17.546875 -0.640625 +Q 12.796875 0.140625 7.71875 1.703125 +L 7.71875 11.625 +Q 12.109375 9.234375 16.796875 8.0625 +Q 21.484375 6.890625 26.703125 6.890625 +Q 35.15625 6.890625 40.078125 11.328125 +Q 45.015625 15.765625 45.015625 23.390625 +Q 45.015625 31 40.078125 35.4375 +Q 35.15625 39.890625 26.703125 39.890625 +Q 22.75 39.890625 18.8125 39.015625 +Q 14.890625 38.140625 10.796875 36.28125 +z +" id="DejaVuSans-53"/> + </defs> + <g style="fill:#262626;" transform="translate(280.646406 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-53"/> + <use x="63.623047" xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + <use x="190.869141" xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="xtick_7"> + <g id="line2d_7"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="339.283906" xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_7"> + <!-- 6000 --> + <defs> + <path d="M 33.015625 40.375 +Q 26.375 40.375 22.484375 
35.828125 +Q 18.609375 31.296875 18.609375 23.390625 +Q 18.609375 15.53125 22.484375 10.953125 +Q 26.375 6.390625 33.015625 6.390625 +Q 39.65625 6.390625 43.53125 10.953125 +Q 47.40625 15.53125 47.40625 23.390625 +Q 47.40625 31.296875 43.53125 35.828125 +Q 39.65625 40.375 33.015625 40.375 +z +M 52.59375 71.296875 +L 52.59375 62.3125 +Q 48.875 64.0625 45.09375 64.984375 +Q 41.3125 65.921875 37.59375 65.921875 +Q 27.828125 65.921875 22.671875 59.328125 +Q 17.53125 52.734375 16.796875 39.40625 +Q 19.671875 43.65625 24.015625 45.921875 +Q 28.375 48.1875 33.59375 48.1875 +Q 44.578125 48.1875 50.953125 41.515625 +Q 57.328125 34.859375 57.328125 23.390625 +Q 57.328125 12.15625 50.6875 5.359375 +Q 44.046875 -1.421875 33.015625 -1.421875 +Q 20.359375 -1.421875 13.671875 8.265625 +Q 6.984375 17.96875 6.984375 36.375 +Q 6.984375 53.65625 15.1875 63.9375 +Q 23.390625 74.21875 37.203125 74.21875 +Q 40.921875 74.21875 44.703125 73.484375 +Q 48.484375 72.75 52.59375 71.296875 +z +" id="DejaVuSans-54"/> + </defs> + <g style="fill:#262626;" transform="translate(325.286406 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-54"/> + <use x="63.623047" xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + <use x="190.869141" xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="xtick_8"> + <g id="line2d_8"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="383.923906" xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_8"> + <!-- 7000 --> + <defs> + <path d="M 8.203125 72.90625 +L 55.078125 72.90625 +L 55.078125 68.703125 +L 28.609375 0 +L 18.3125 0 +L 43.21875 64.59375 +L 8.203125 64.59375 +z +" id="DejaVuSans-55"/> + </defs> + <g style="fill:#262626;" transform="translate(369.926406 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-55"/> + <use x="63.623047" xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + <use x="190.869141" 
xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="xtick_9"> + <g id="line2d_9"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="428.563906" xlink:href="#me95d5351a6" y="288.430125"/> + </g> + </g> + <g id="text_9"> + <!-- 8000 --> + <defs> + <path d="M 31.78125 34.625 +Q 24.75 34.625 20.71875 30.859375 +Q 16.703125 27.09375 16.703125 20.515625 +Q 16.703125 13.921875 20.71875 10.15625 +Q 24.75 6.390625 31.78125 6.390625 +Q 38.8125 6.390625 42.859375 10.171875 +Q 46.921875 13.96875 46.921875 20.515625 +Q 46.921875 27.09375 42.890625 30.859375 +Q 38.875 34.625 31.78125 34.625 +z +M 21.921875 38.8125 +Q 15.578125 40.375 12.03125 44.71875 +Q 8.5 49.078125 8.5 55.328125 +Q 8.5 64.0625 14.71875 69.140625 +Q 20.953125 74.21875 31.78125 74.21875 +Q 42.671875 74.21875 48.875 69.140625 +Q 55.078125 64.0625 55.078125 55.328125 +Q 55.078125 49.078125 51.53125 44.71875 +Q 48 40.375 41.703125 38.8125 +Q 48.828125 37.15625 52.796875 32.3125 +Q 56.78125 27.484375 56.78125 20.515625 +Q 56.78125 9.90625 50.3125 4.234375 +Q 43.84375 -1.421875 31.78125 -1.421875 +Q 19.734375 -1.421875 13.25 4.234375 +Q 6.78125 9.90625 6.78125 20.515625 +Q 6.78125 27.484375 10.78125 32.3125 +Q 14.796875 37.15625 21.921875 38.8125 +z +M 18.3125 54.390625 +Q 18.3125 48.734375 21.84375 45.5625 +Q 25.390625 42.390625 31.78125 42.390625 +Q 38.140625 42.390625 41.71875 45.5625 +Q 45.3125 48.734375 45.3125 54.390625 +Q 45.3125 60.0625 41.71875 63.234375 +Q 38.140625 66.40625 31.78125 66.40625 +Q 25.390625 66.40625 21.84375 63.234375 +Q 18.3125 60.0625 18.3125 54.390625 +z +" id="DejaVuSans-56"/> + </defs> + <g style="fill:#262626;" transform="translate(414.566406 306.288406)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-56"/> + <use x="63.623047" xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + <use x="190.869141" xlink:href="#DejaVuSans-48"/> + </g> + </g> + </g> + <g id="text_10"> + <!-- Speed (MiB/s) --> + <defs> + <path d="M 53.515625 
70.515625 +L 53.515625 60.890625 +Q 47.90625 63.578125 42.921875 64.890625 +Q 37.9375 66.21875 33.296875 66.21875 +Q 25.25 66.21875 20.875 63.09375 +Q 16.5 59.96875 16.5 54.203125 +Q 16.5 49.359375 19.40625 46.890625 +Q 22.3125 44.4375 30.421875 42.921875 +L 36.375 41.703125 +Q 47.40625 39.59375 52.65625 34.296875 +Q 57.90625 29 57.90625 20.125 +Q 57.90625 9.515625 50.796875 4.046875 +Q 43.703125 -1.421875 29.984375 -1.421875 +Q 24.8125 -1.421875 18.96875 -0.25 +Q 13.140625 0.921875 6.890625 3.21875 +L 6.890625 13.375 +Q 12.890625 10.015625 18.65625 8.296875 +Q 24.421875 6.59375 29.984375 6.59375 +Q 38.421875 6.59375 43.015625 9.90625 +Q 47.609375 13.234375 47.609375 19.390625 +Q 47.609375 24.75 44.3125 27.78125 +Q 41.015625 30.8125 33.5 32.328125 +L 27.484375 33.5 +Q 16.453125 35.6875 11.515625 40.375 +Q 6.59375 45.0625 6.59375 53.421875 +Q 6.59375 63.09375 13.40625 68.65625 +Q 20.21875 74.21875 32.171875 74.21875 +Q 37.3125 74.21875 42.625 73.28125 +Q 47.953125 72.359375 53.515625 70.515625 +z +" id="DejaVuSans-83"/> + <path d="M 18.109375 8.203125 +L 18.109375 -20.796875 +L 9.078125 -20.796875 +L 9.078125 54.6875 +L 18.109375 54.6875 +L 18.109375 46.390625 +Q 20.953125 51.265625 25.265625 53.625 +Q 29.59375 56 35.59375 56 +Q 45.5625 56 51.78125 48.09375 +Q 58.015625 40.1875 58.015625 27.296875 +Q 58.015625 14.40625 51.78125 6.484375 +Q 45.5625 -1.421875 35.59375 -1.421875 +Q 29.59375 -1.421875 25.265625 0.953125 +Q 20.953125 3.328125 18.109375 8.203125 +z +M 48.6875 27.296875 +Q 48.6875 37.203125 44.609375 42.84375 +Q 40.53125 48.484375 33.40625 48.484375 +Q 26.265625 48.484375 22.1875 42.84375 +Q 18.109375 37.203125 18.109375 27.296875 +Q 18.109375 17.390625 22.1875 11.75 +Q 26.265625 6.109375 33.40625 6.109375 +Q 40.53125 6.109375 44.609375 11.75 +Q 48.6875 17.390625 48.6875 27.296875 +z +" id="DejaVuSans-112"/> + <path d="M 56.203125 29.59375 +L 56.203125 25.203125 +L 14.890625 25.203125 +Q 15.484375 15.921875 20.484375 11.0625 +Q 25.484375 6.203125 34.421875 
6.203125 +Q 39.59375 6.203125 44.453125 7.46875 +Q 49.3125 8.734375 54.109375 11.28125 +L 54.109375 2.78125 +Q 49.265625 0.734375 44.1875 -0.34375 +Q 39.109375 -1.421875 33.890625 -1.421875 +Q 20.796875 -1.421875 13.15625 6.1875 +Q 5.515625 13.8125 5.515625 26.8125 +Q 5.515625 40.234375 12.765625 48.109375 +Q 20.015625 56 32.328125 56 +Q 43.359375 56 49.78125 48.890625 +Q 56.203125 41.796875 56.203125 29.59375 +z +M 47.21875 32.234375 +Q 47.125 39.59375 43.09375 43.984375 +Q 39.0625 48.390625 32.421875 48.390625 +Q 24.90625 48.390625 20.390625 44.140625 +Q 15.875 39.890625 15.1875 32.171875 +z +" id="DejaVuSans-101"/> + <path d="M 45.40625 46.390625 +L 45.40625 75.984375 +L 54.390625 75.984375 +L 54.390625 0 +L 45.40625 0 +L 45.40625 8.203125 +Q 42.578125 3.328125 38.25 0.953125 +Q 33.9375 -1.421875 27.875 -1.421875 +Q 17.96875 -1.421875 11.734375 6.484375 +Q 5.515625 14.40625 5.515625 27.296875 +Q 5.515625 40.1875 11.734375 48.09375 +Q 17.96875 56 27.875 56 +Q 33.9375 56 38.25 53.625 +Q 42.578125 51.265625 45.40625 46.390625 +z +M 14.796875 27.296875 +Q 14.796875 17.390625 18.875 11.75 +Q 22.953125 6.109375 30.078125 6.109375 +Q 37.203125 6.109375 41.296875 11.75 +Q 45.40625 17.390625 45.40625 27.296875 +Q 45.40625 37.203125 41.296875 42.84375 +Q 37.203125 48.484375 30.078125 48.484375 +Q 22.953125 48.484375 18.875 42.84375 +Q 14.796875 37.203125 14.796875 27.296875 +z +" id="DejaVuSans-100"/> + <path id="DejaVuSans-32"/> + <path d="M 31 75.875 +Q 24.46875 64.65625 21.28125 53.65625 +Q 18.109375 42.671875 18.109375 31.390625 +Q 18.109375 20.125 21.3125 9.0625 +Q 24.515625 -2 31 -13.1875 +L 23.1875 -13.1875 +Q 15.875 -1.703125 12.234375 9.375 +Q 8.59375 20.453125 8.59375 31.390625 +Q 8.59375 42.28125 12.203125 53.3125 +Q 15.828125 64.359375 23.1875 75.875 +z +" id="DejaVuSans-40"/> + <path d="M 9.8125 72.90625 +L 24.515625 72.90625 +L 43.109375 23.296875 +L 61.8125 72.90625 +L 76.515625 72.90625 +L 76.515625 0 +L 66.890625 0 +L 66.890625 64.015625 +L 48.09375 
14.015625 +L 38.1875 14.015625 +L 19.390625 64.015625 +L 19.390625 0 +L 9.8125 0 +z +" id="DejaVuSans-77"/> + <path d="M 9.421875 54.6875 +L 18.40625 54.6875 +L 18.40625 0 +L 9.421875 0 +z +M 9.421875 75.984375 +L 18.40625 75.984375 +L 18.40625 64.59375 +L 9.421875 64.59375 +z +" id="DejaVuSans-105"/> + <path d="M 19.671875 34.8125 +L 19.671875 8.109375 +L 35.5 8.109375 +Q 43.453125 8.109375 47.28125 11.40625 +Q 51.125 14.703125 51.125 21.484375 +Q 51.125 28.328125 47.28125 31.5625 +Q 43.453125 34.8125 35.5 34.8125 +z +M 19.671875 64.796875 +L 19.671875 42.828125 +L 34.28125 42.828125 +Q 41.5 42.828125 45.03125 45.53125 +Q 48.578125 48.25 48.578125 53.8125 +Q 48.578125 59.328125 45.03125 62.0625 +Q 41.5 64.796875 34.28125 64.796875 +z +M 9.8125 72.90625 +L 35.015625 72.90625 +Q 46.296875 72.90625 52.390625 68.21875 +Q 58.5 63.53125 58.5 54.890625 +Q 58.5 48.1875 55.375 44.234375 +Q 52.25 40.28125 46.1875 39.3125 +Q 53.46875 37.75 57.5 32.78125 +Q 61.53125 27.828125 61.53125 20.40625 +Q 61.53125 10.640625 54.890625 5.3125 +Q 48.25 0 35.984375 0 +L 9.8125 0 +z +" id="DejaVuSans-66"/> + <path d="M 25.390625 72.90625 +L 33.6875 72.90625 +L 8.296875 -9.28125 +L 0 -9.28125 +z +" id="DejaVuSans-47"/> + <path d="M 44.28125 53.078125 +L 44.28125 44.578125 +Q 40.484375 46.53125 36.375 47.5 +Q 32.28125 48.484375 27.875 48.484375 +Q 21.1875 48.484375 17.84375 46.4375 +Q 14.5 44.390625 14.5 40.28125 +Q 14.5 37.15625 16.890625 35.375 +Q 19.28125 33.59375 26.515625 31.984375 +L 29.59375 31.296875 +Q 39.15625 29.25 43.1875 25.515625 +Q 47.21875 21.78125 47.21875 15.09375 +Q 47.21875 7.46875 41.1875 3.015625 +Q 35.15625 -1.421875 24.609375 -1.421875 +Q 20.21875 -1.421875 15.453125 -0.5625 +Q 10.6875 0.296875 5.421875 2 +L 5.421875 11.28125 +Q 10.40625 8.6875 15.234375 7.390625 +Q 20.0625 6.109375 24.8125 6.109375 +Q 31.15625 6.109375 34.5625 8.28125 +Q 37.984375 10.453125 37.984375 14.40625 +Q 37.984375 18.0625 35.515625 20.015625 +Q 33.0625 21.96875 24.703125 23.78125 +L 21.578125 
24.515625 +Q 13.234375 26.265625 9.515625 29.90625 +Q 5.8125 33.546875 5.8125 39.890625 +Q 5.8125 47.609375 11.28125 51.796875 +Q 16.75 56 26.8125 56 +Q 31.78125 56 36.171875 55.265625 +Q 40.578125 54.546875 44.28125 53.078125 +z +" id="DejaVuSans-115"/> + <path d="M 8.015625 75.875 +L 15.828125 75.875 +Q 23.140625 64.359375 26.78125 53.3125 +Q 30.421875 42.28125 30.421875 31.390625 +Q 30.421875 20.453125 26.78125 9.375 +Q 23.140625 -1.703125 15.828125 -13.1875 +L 8.015625 -13.1875 +Q 14.5 -2 17.703125 9.0625 +Q 20.90625 20.125 20.90625 31.390625 +Q 20.90625 42.671875 17.703125 53.65625 +Q 14.5 64.65625 8.015625 75.875 +z +" id="DejaVuSans-41"/> + </defs> + <g style="fill:#262626;" transform="translate(208.497031 321.694187)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-83"/> + <use x="63.476562" xlink:href="#DejaVuSans-112"/> + <use x="126.953125" xlink:href="#DejaVuSans-101"/> + <use x="188.476562" xlink:href="#DejaVuSans-101"/> + <use x="250" xlink:href="#DejaVuSans-100"/> + <use x="313.476562" xlink:href="#DejaVuSans-32"/> + <use x="345.263672" xlink:href="#DejaVuSans-40"/> + <use x="384.277344" xlink:href="#DejaVuSans-77"/> + <use x="470.556641" xlink:href="#DejaVuSans-105"/> + <use x="498.339844" xlink:href="#DejaVuSans-66"/> + <use x="566.943359" xlink:href="#DejaVuSans-47"/> + <use x="600.634766" xlink:href="#DejaVuSans-115"/> + <use x="652.734375" xlink:href="#DejaVuSans-41"/> + </g> + </g> + </g> + <g id="matplotlib.axis_2"> + <g id="ytick_1"> + <g id="line2d_10"> + <defs> + <path d="M 0 0 +L -6 0 +" id="m7d1bb602a9" style="stroke:#262626;stroke-width:1.25;"/> + </defs> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="38.950125"/> + </g> + </g> + <g id="text_11"> + <!-- BLAKE3 --> + <defs> + <path d="M 9.8125 72.90625 +L 19.671875 72.90625 +L 19.671875 8.296875 +L 55.171875 8.296875 +L 55.171875 0 +L 9.8125 0 +z +" id="DejaVuSans-76"/> + <path d="M 34.1875 63.1875 +L 20.796875 26.90625 +L 
47.609375 26.90625 +z +M 28.609375 72.90625 +L 39.796875 72.90625 +L 67.578125 0 +L 57.328125 0 +L 50.6875 18.703125 +L 17.828125 18.703125 +L 11.1875 0 +L 0.78125 0 +z +" id="DejaVuSans-65"/> + <path d="M 9.8125 72.90625 +L 19.671875 72.90625 +L 19.671875 42.09375 +L 52.390625 72.90625 +L 65.09375 72.90625 +L 28.90625 38.921875 +L 67.671875 0 +L 54.6875 0 +L 19.671875 35.109375 +L 19.671875 0 +L 9.8125 0 +z +" id="DejaVuSans-75"/> + <path d="M 9.8125 72.90625 +L 55.90625 72.90625 +L 55.90625 64.59375 +L 19.671875 64.59375 +L 19.671875 43.015625 +L 54.390625 43.015625 +L 54.390625 34.71875 +L 19.671875 34.71875 +L 19.671875 8.296875 +L 56.78125 8.296875 +L 56.78125 0 +L 9.8125 0 +z +" id="DejaVuSans-69"/> + </defs> + <g style="fill:#262626;" transform="translate(19.576719 43.129266)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-66"/> + <use x="68.603516" xlink:href="#DejaVuSans-76"/> + <use x="124.347656" xlink:href="#DejaVuSans-65"/> + <use x="192.755859" xlink:href="#DejaVuSans-75"/> + <use x="258.332031" xlink:href="#DejaVuSans-69"/> + <use x="321.515625" xlink:href="#DejaVuSans-51"/> + </g> + </g> + </g> + <g id="ytick_2"> + <g id="line2d_11"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="72.214125"/> + </g> + </g> + <g id="text_12"> + <!-- BLAKE2b --> + <defs> + <path d="M 48.6875 27.296875 +Q 48.6875 37.203125 44.609375 42.84375 +Q 40.53125 48.484375 33.40625 48.484375 +Q 26.265625 48.484375 22.1875 42.84375 +Q 18.109375 37.203125 18.109375 27.296875 +Q 18.109375 17.390625 22.1875 11.75 +Q 26.265625 6.109375 33.40625 6.109375 +Q 40.53125 6.109375 44.609375 11.75 +Q 48.6875 17.390625 48.6875 27.296875 +z +M 18.109375 46.390625 +Q 20.953125 51.265625 25.265625 53.625 +Q 29.59375 56 35.59375 56 +Q 45.5625 56 51.78125 48.09375 +Q 58.015625 40.1875 58.015625 27.296875 +Q 58.015625 14.40625 51.78125 6.484375 +Q 45.5625 -1.421875 35.59375 -1.421875 +Q 29.59375 -1.421875 25.265625 0.953125 +Q 
20.953125 3.328125 18.109375 8.203125 +L 18.109375 0 +L 9.078125 0 +L 9.078125 75.984375 +L 18.109375 75.984375 +z +" id="DejaVuSans-98"/> + </defs> + <g style="fill:#262626;" transform="translate(12.593438 76.393266)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-66"/> + <use x="68.603516" xlink:href="#DejaVuSans-76"/> + <use x="124.347656" xlink:href="#DejaVuSans-65"/> + <use x="192.755859" xlink:href="#DejaVuSans-75"/> + <use x="258.332031" xlink:href="#DejaVuSans-69"/> + <use x="321.515625" xlink:href="#DejaVuSans-50"/> + <use x="385.138672" xlink:href="#DejaVuSans-98"/> + </g> + </g> + </g> + <g id="ytick_3"> + <g id="line2d_12"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="105.478125"/> + </g> + </g> + <g id="text_13"> + <!-- SHA-1 --> + <defs> + <path d="M 9.8125 72.90625 +L 19.671875 72.90625 +L 19.671875 43.015625 +L 55.515625 43.015625 +L 55.515625 72.90625 +L 65.375 72.90625 +L 65.375 0 +L 55.515625 0 +L 55.515625 34.71875 +L 19.671875 34.71875 +L 19.671875 0 +L 9.8125 0 +z +" id="DejaVuSans-72"/> + <path d="M 4.890625 31.390625 +L 31.203125 31.390625 +L 31.203125 23.390625 +L 4.890625 23.390625 +z +" id="DejaVuSans-45"/> + </defs> + <g style="fill:#262626;" transform="translate(28.199687 109.657266)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-83"/> + <use x="63.476562" xlink:href="#DejaVuSans-72"/> + <use x="138.671875" xlink:href="#DejaVuSans-65"/> + <use x="207.048828" xlink:href="#DejaVuSans-45"/> + <use x="243.132812" xlink:href="#DejaVuSans-49"/> + </g> + </g> + </g> + <g id="ytick_4"> + <g id="line2d_13"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="138.742125"/> + </g> + </g> + <g id="text_14"> + <!-- BLAKE2s --> + <g style="fill:#262626;" transform="translate(13.846406 142.921266)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-66"/> + <use x="68.603516" xlink:href="#DejaVuSans-76"/> + <use x="124.347656" 
xlink:href="#DejaVuSans-65"/> + <use x="192.755859" xlink:href="#DejaVuSans-75"/> + <use x="258.332031" xlink:href="#DejaVuSans-69"/> + <use x="321.515625" xlink:href="#DejaVuSans-50"/> + <use x="385.138672" xlink:href="#DejaVuSans-115"/> + </g> + </g> + </g> + <g id="ytick_5"> + <g id="line2d_14"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="172.006125"/> + </g> + </g> + <g id="text_15"> + <!-- MD5 --> + <defs> + <path d="M 19.671875 64.796875 +L 19.671875 8.109375 +L 31.59375 8.109375 +Q 46.6875 8.109375 53.6875 14.9375 +Q 60.6875 21.78125 60.6875 36.53125 +Q 60.6875 51.171875 53.6875 57.984375 +Q 46.6875 64.796875 31.59375 64.796875 +z +M 9.8125 72.90625 +L 30.078125 72.90625 +Q 51.265625 72.90625 61.171875 64.09375 +Q 71.09375 55.28125 71.09375 36.53125 +Q 71.09375 17.671875 61.125 8.828125 +Q 51.171875 0 30.078125 0 +L 9.8125 0 +z +" id="DejaVuSans-68"/> + </defs> + <g style="fill:#262626;" transform="translate(36.984219 176.185266)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-77"/> + <use x="86.279297" xlink:href="#DejaVuSans-68"/> + <use x="163.28125" xlink:href="#DejaVuSans-53"/> + </g> + </g> + </g> + <g id="ytick_6"> + <g id="line2d_15"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="205.270125"/> + </g> + </g> + <g id="text_16"> + <!-- SHA-512 --> + <g style="fill:#262626;" transform="translate(14.202188 209.449266)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-83"/> + <use x="63.476562" xlink:href="#DejaVuSans-72"/> + <use x="138.671875" xlink:href="#DejaVuSans-65"/> + <use x="207.048828" xlink:href="#DejaVuSans-45"/> + <use x="243.132812" xlink:href="#DejaVuSans-53"/> + <use x="306.755859" xlink:href="#DejaVuSans-49"/> + <use x="370.378906" xlink:href="#DejaVuSans-50"/> + </g> + </g> + </g> + <g id="ytick_7"> + <g id="line2d_16"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" 
xlink:href="#m7d1bb602a9" y="238.534125"/> + </g> + </g> + <g id="text_17"> + <!-- SHA-256 --> + <g style="fill:#262626;" transform="translate(14.202188 242.713266)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-83"/> + <use x="63.476562" xlink:href="#DejaVuSans-72"/> + <use x="138.671875" xlink:href="#DejaVuSans-65"/> + <use x="207.048828" xlink:href="#DejaVuSans-45"/> + <use x="243.132812" xlink:href="#DejaVuSans-50"/> + <use x="306.755859" xlink:href="#DejaVuSans-53"/> + <use x="370.378906" xlink:href="#DejaVuSans-54"/> + </g> + </g> + </g> + <g id="ytick_8"> + <g id="line2d_17"> + <g> + <use style="fill:#262626;stroke:#262626;stroke-width:1.25;" x="71.443906" xlink:href="#m7d1bb602a9" y="271.798125"/> + </g> + </g> + <g id="text_18"> + <!-- SHA3-256 --> + <g style="fill:#262626;" transform="translate(7.2 275.977266)scale(0.11 -0.11)"> + <use xlink:href="#DejaVuSans-83"/> + <use x="63.476562" xlink:href="#DejaVuSans-72"/> + <use x="138.671875" xlink:href="#DejaVuSans-65"/> + <use x="207.080078" xlink:href="#DejaVuSans-51"/> + <use x="270.703125" xlink:href="#DejaVuSans-45"/> + <use x="306.787109" xlink:href="#DejaVuSans-50"/> + <use x="370.410156" xlink:href="#DejaVuSans-53"/> + <use x="434.033203" xlink:href="#DejaVuSans-54"/> + </g> + </g> + </g> + </g> + <g id="patch_3"> + <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 25.644525 +L 377.942146 25.644525 +L 377.942146 52.255725 +L 71.443906 52.255725 +z +" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> + </g> + <g id="patch_4"> + <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 58.908525 +L 130.011586 58.908525 +L 130.011586 85.519725 +L 71.443906 85.519725 +z +" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> + </g> + <g id="patch_5"> + <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 92.172525 +L 117.289186 92.172525 +L 117.289186 118.783725 +L 71.443906 118.783725 +z +" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> + </g> + <g id="patch_6"> + <path 
clip-path="url(#p6091bd3d0f)" d="M 71.443906 125.436525 +L 110.548546 125.436525 +L 110.548546 152.047725 +L 71.443906 152.047725 +z +" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> + </g> + <g id="patch_7"> + <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 158.700525 +L 104.477506 158.700525 +L 104.477506 185.311725 +L 71.443906 185.311725 +z +" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> + </g> + <g id="patch_8"> + <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 191.964525 +L 103.584706 191.964525 +L 103.584706 218.575725 +L 71.443906 218.575725 +z +" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> + </g> + <g id="patch_9"> + <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 225.228525 +L 93.049666 225.228525 +L 93.049666 251.839725 +L 71.443906 251.839725 +z +" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> + </g> + <g id="patch_10"> + <path clip-path="url(#p6091bd3d0f)" d="M 71.443906 258.492525 +L 89.032066 258.492525 +L 89.032066 285.103725 +L 71.443906 285.103725 +z +" style="fill:#df2020;stroke:#000000;stroke-linejoin:miter;"/> + </g> + <g id="line2d_18"> + <path clip-path="url(#p6091bd3d0f)" d="M 0 0 +" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> + </g> + <g id="line2d_19"> + <path clip-path="url(#p6091bd3d0f)" d="M 0 0 +" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> + </g> + <g id="line2d_20"> + <path clip-path="url(#p6091bd3d0f)" d="M 0 0 +" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> + </g> + <g id="line2d_21"> + <path clip-path="url(#p6091bd3d0f)" d="M 0 0 +" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> + </g> + <g id="line2d_22"> + <path clip-path="url(#p6091bd3d0f)" d="M 0 0 +" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> + </g> + <g id="line2d_23"> + <path clip-path="url(#p6091bd3d0f)" d="M 0 0 +" 
style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> + </g> + <g id="line2d_24"> + <path clip-path="url(#p6091bd3d0f)" d="M 0 0 +" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> + </g> + <g id="line2d_25"> + <path clip-path="url(#p6091bd3d0f)" d="M 0 0 +" style="fill:none;stroke:#424242;stroke-linecap:round;stroke-width:2.7;"/> + </g> + <g id="patch_11"> + <path d="M 71.443906 288.430125 +L 71.443906 22.318125 +" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/> + </g> + <g id="patch_12"> + <path d="M 428.563906 288.430125 +L 428.563906 22.318125 +" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/> + </g> + <g id="patch_13"> + <path d="M 71.443906 288.430125 +L 428.563906 288.430125 +" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/> + </g> + <g id="patch_14"> + <path d="M 71.443906 22.318125 +L 428.563906 22.318125 +" style="fill:none;stroke:#262626;stroke-linecap:square;stroke-linejoin:miter;stroke-width:1.25;"/> + </g> + <g id="text_19"> + <!-- 6866 --> + <g style="fill:#262626;" transform="translate(382.406146 43.939725)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-54"/> + <use x="63.623047" xlink:href="#DejaVuSans-56"/> + <use x="127.246094" xlink:href="#DejaVuSans-54"/> + <use x="190.869141" xlink:href="#DejaVuSans-54"/> + </g> + </g> + <g id="text_20"> + <!-- 1312 --> + <g style="fill:#262626;" transform="translate(134.475586 77.203725)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-49"/> + <use x="63.623047" xlink:href="#DejaVuSans-51"/> + <use x="127.246094" xlink:href="#DejaVuSans-49"/> + <use x="190.869141" xlink:href="#DejaVuSans-50"/> + </g> + </g> + <g id="text_21"> + <!-- 1027 --> + <g style="fill:#262626;" transform="translate(121.753186 110.467725)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-49"/> + <use x="63.623047" 
xlink:href="#DejaVuSans-48"/> + <use x="127.246094" xlink:href="#DejaVuSans-50"/> + <use x="190.869141" xlink:href="#DejaVuSans-55"/> + </g> + </g> + <g id="text_22"> + <!-- 876 --> + <g style="fill:#262626;" transform="translate(115.012546 143.731725)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-56"/> + <use x="63.623047" xlink:href="#DejaVuSans-55"/> + <use x="127.246094" xlink:href="#DejaVuSans-54"/> + </g> + </g> + <g id="text_23"> + <!-- 740 --> + <g style="fill:#262626;" transform="translate(108.941506 176.995725)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-55"/> + <use x="63.623047" xlink:href="#DejaVuSans-52"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + </g> + </g> + <g id="text_24"> + <!-- 720 --> + <g style="fill:#262626;" transform="translate(108.048706 210.259725)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-55"/> + <use x="63.623047" xlink:href="#DejaVuSans-50"/> + <use x="127.246094" xlink:href="#DejaVuSans-48"/> + </g> + </g> + <g id="text_25"> + <!-- 484 --> + <g style="fill:#262626;" transform="translate(97.513666 243.523725)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-52"/> + <use x="63.623047" xlink:href="#DejaVuSans-56"/> + <use x="127.246094" xlink:href="#DejaVuSans-52"/> + </g> + </g> + <g id="text_26"> + <!-- 394 --> + <defs> + <path d="M 10.984375 1.515625 +L 10.984375 10.5 +Q 14.703125 8.734375 18.5 7.8125 +Q 22.3125 6.890625 25.984375 6.890625 +Q 35.75 6.890625 40.890625 13.453125 +Q 46.046875 20.015625 46.78125 33.40625 +Q 43.953125 29.203125 39.59375 26.953125 +Q 35.25 24.703125 29.984375 24.703125 +Q 19.046875 24.703125 12.671875 31.3125 +Q 6.296875 37.9375 6.296875 49.421875 +Q 6.296875 60.640625 12.9375 67.421875 +Q 19.578125 74.21875 30.609375 74.21875 +Q 43.265625 74.21875 49.921875 64.515625 +Q 56.59375 54.828125 56.59375 36.375 +Q 56.59375 19.140625 48.40625 8.859375 +Q 40.234375 -1.421875 26.421875 -1.421875 +Q 22.703125 -1.421875 18.890625 -0.6875 +Q 15.09375 0.046875 10.984375 1.515625 +z +M 
30.609375 32.421875 +Q 37.25 32.421875 41.125 36.953125 +Q 45.015625 41.5 45.015625 49.421875 +Q 45.015625 57.28125 41.125 61.84375 +Q 37.25 66.40625 30.609375 66.40625 +Q 23.96875 66.40625 20.09375 61.84375 +Q 16.21875 57.28125 16.21875 49.421875 +Q 16.21875 41.5 20.09375 36.953125 +Q 23.96875 32.421875 30.609375 32.421875 +z +" id="DejaVuSans-57"/> + </defs> + <g style="fill:#262626;" transform="translate(93.496066 276.787725)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-51"/> + <use x="63.623047" xlink:href="#DejaVuSans-57"/> + <use x="127.246094" xlink:href="#DejaVuSans-52"/> + </g> + </g> + <g id="text_27"> + <!-- Performance on AWS c5.metal, 16 KiB input, 1 thread --> + <defs> + <path d="M 19.671875 64.796875 +L 19.671875 37.40625 +L 32.078125 37.40625 +Q 38.96875 37.40625 42.71875 40.96875 +Q 46.484375 44.53125 46.484375 51.125 +Q 46.484375 57.671875 42.71875 61.234375 +Q 38.96875 64.796875 32.078125 64.796875 +z +M 9.8125 72.90625 +L 32.078125 72.90625 +Q 44.34375 72.90625 50.609375 67.359375 +Q 56.890625 61.8125 56.890625 51.125 +Q 56.890625 40.328125 50.609375 34.8125 +Q 44.34375 29.296875 32.078125 29.296875 +L 19.671875 29.296875 +L 19.671875 0 +L 9.8125 0 +z +" id="DejaVuSans-80"/> + <path d="M 41.109375 46.296875 +Q 39.59375 47.171875 37.8125 47.578125 +Q 36.03125 48 33.890625 48 +Q 26.265625 48 22.1875 43.046875 +Q 18.109375 38.09375 18.109375 28.8125 +L 18.109375 0 +L 9.078125 0 +L 9.078125 54.6875 +L 18.109375 54.6875 +L 18.109375 46.1875 +Q 20.953125 51.171875 25.484375 53.578125 +Q 30.03125 56 36.53125 56 +Q 37.453125 56 38.578125 55.875 +Q 39.703125 55.765625 41.0625 55.515625 +z +" id="DejaVuSans-114"/> + <path d="M 37.109375 75.984375 +L 37.109375 68.5 +L 28.515625 68.5 +Q 23.6875 68.5 21.796875 66.546875 +Q 19.921875 64.59375 19.921875 59.515625 +L 19.921875 54.6875 +L 34.71875 54.6875 +L 34.71875 47.703125 +L 19.921875 47.703125 +L 19.921875 0 +L 10.890625 0 +L 10.890625 47.703125 +L 2.296875 47.703125 +L 2.296875 54.6875 +L 10.890625 
54.6875 +L 10.890625 58.5 +Q 10.890625 67.625 15.140625 71.796875 +Q 19.390625 75.984375 28.609375 75.984375 +z +" id="DejaVuSans-102"/> + <path d="M 30.609375 48.390625 +Q 23.390625 48.390625 19.1875 42.75 +Q 14.984375 37.109375 14.984375 27.296875 +Q 14.984375 17.484375 19.15625 11.84375 +Q 23.34375 6.203125 30.609375 6.203125 +Q 37.796875 6.203125 41.984375 11.859375 +Q 46.1875 17.53125 46.1875 27.296875 +Q 46.1875 37.015625 41.984375 42.703125 +Q 37.796875 48.390625 30.609375 48.390625 +z +M 30.609375 56 +Q 42.328125 56 49.015625 48.375 +Q 55.71875 40.765625 55.71875 27.296875 +Q 55.71875 13.875 49.015625 6.21875 +Q 42.328125 -1.421875 30.609375 -1.421875 +Q 18.84375 -1.421875 12.171875 6.21875 +Q 5.515625 13.875 5.515625 27.296875 +Q 5.515625 40.765625 12.171875 48.375 +Q 18.84375 56 30.609375 56 +z +" id="DejaVuSans-111"/> + <path d="M 52 44.1875 +Q 55.375 50.25 60.0625 53.125 +Q 64.75 56 71.09375 56 +Q 79.640625 56 84.28125 50.015625 +Q 88.921875 44.046875 88.921875 33.015625 +L 88.921875 0 +L 79.890625 0 +L 79.890625 32.71875 +Q 79.890625 40.578125 77.09375 44.375 +Q 74.3125 48.1875 68.609375 48.1875 +Q 61.625 48.1875 57.5625 43.546875 +Q 53.515625 38.921875 53.515625 30.90625 +L 53.515625 0 +L 44.484375 0 +L 44.484375 32.71875 +Q 44.484375 40.625 41.703125 44.40625 +Q 38.921875 48.1875 33.109375 48.1875 +Q 26.21875 48.1875 22.15625 43.53125 +Q 18.109375 38.875 18.109375 30.90625 +L 18.109375 0 +L 9.078125 0 +L 9.078125 54.6875 +L 18.109375 54.6875 +L 18.109375 46.1875 +Q 21.1875 51.21875 25.484375 53.609375 +Q 29.78125 56 35.6875 56 +Q 41.65625 56 45.828125 52.96875 +Q 50 49.953125 52 44.1875 +z +" id="DejaVuSans-109"/> + <path d="M 34.28125 27.484375 +Q 23.390625 27.484375 19.1875 25 +Q 14.984375 22.515625 14.984375 16.5 +Q 14.984375 11.71875 18.140625 8.90625 +Q 21.296875 6.109375 26.703125 6.109375 +Q 34.1875 6.109375 38.703125 11.40625 +Q 43.21875 16.703125 43.21875 25.484375 +L 43.21875 27.484375 +z +M 52.203125 31.203125 +L 52.203125 0 +L 43.21875 0 
+L 43.21875 8.296875 +Q 40.140625 3.328125 35.546875 0.953125 +Q 30.953125 -1.421875 24.3125 -1.421875 +Q 15.921875 -1.421875 10.953125 3.296875 +Q 6 8.015625 6 15.921875 +Q 6 25.140625 12.171875 29.828125 +Q 18.359375 34.515625 30.609375 34.515625 +L 43.21875 34.515625 +L 43.21875 35.40625 +Q 43.21875 41.609375 39.140625 45 +Q 35.0625 48.390625 27.6875 48.390625 +Q 23 48.390625 18.546875 47.265625 +Q 14.109375 46.140625 10.015625 43.890625 +L 10.015625 52.203125 +Q 14.9375 54.109375 19.578125 55.046875 +Q 24.21875 56 28.609375 56 +Q 40.484375 56 46.34375 49.84375 +Q 52.203125 43.703125 52.203125 31.203125 +z +" id="DejaVuSans-97"/> + <path d="M 54.890625 33.015625 +L 54.890625 0 +L 45.90625 0 +L 45.90625 32.71875 +Q 45.90625 40.484375 42.875 44.328125 +Q 39.84375 48.1875 33.796875 48.1875 +Q 26.515625 48.1875 22.3125 43.546875 +Q 18.109375 38.921875 18.109375 30.90625 +L 18.109375 0 +L 9.078125 0 +L 9.078125 54.6875 +L 18.109375 54.6875 +L 18.109375 46.1875 +Q 21.34375 51.125 25.703125 53.5625 +Q 30.078125 56 35.796875 56 +Q 45.21875 56 50.046875 50.171875 +Q 54.890625 44.34375 54.890625 33.015625 +z +" id="DejaVuSans-110"/> + <path d="M 48.78125 52.59375 +L 48.78125 44.1875 +Q 44.96875 46.296875 41.140625 47.34375 +Q 37.3125 48.390625 33.40625 48.390625 +Q 24.65625 48.390625 19.8125 42.84375 +Q 14.984375 37.3125 14.984375 27.296875 +Q 14.984375 17.28125 19.8125 11.734375 +Q 24.65625 6.203125 33.40625 6.203125 +Q 37.3125 6.203125 41.140625 7.25 +Q 44.96875 8.296875 48.78125 10.40625 +L 48.78125 2.09375 +Q 45.015625 0.34375 40.984375 -0.53125 +Q 36.96875 -1.421875 32.421875 -1.421875 +Q 20.0625 -1.421875 12.78125 6.34375 +Q 5.515625 14.109375 5.515625 27.296875 +Q 5.515625 40.671875 12.859375 48.328125 +Q 20.21875 56 33.015625 56 +Q 37.15625 56 41.109375 55.140625 +Q 45.0625 54.296875 48.78125 52.59375 +z +" id="DejaVuSans-99"/> + <path d="M 3.328125 72.90625 +L 13.28125 72.90625 +L 28.609375 11.28125 +L 43.890625 72.90625 +L 54.984375 72.90625 +L 70.3125 11.28125 
+L 85.59375 72.90625 +L 95.609375 72.90625 +L 77.296875 0 +L 64.890625 0 +L 49.515625 63.28125 +L 33.984375 0 +L 21.578125 0 +z +" id="DejaVuSans-87"/> + <path d="M 10.6875 12.40625 +L 21 12.40625 +L 21 0 +L 10.6875 0 +z +" id="DejaVuSans-46"/> + <path d="M 18.3125 70.21875 +L 18.3125 54.6875 +L 36.8125 54.6875 +L 36.8125 47.703125 +L 18.3125 47.703125 +L 18.3125 18.015625 +Q 18.3125 11.328125 20.140625 9.421875 +Q 21.96875 7.515625 27.59375 7.515625 +L 36.8125 7.515625 +L 36.8125 0 +L 27.59375 0 +Q 17.1875 0 13.234375 3.875 +Q 9.28125 7.765625 9.28125 18.015625 +L 9.28125 47.703125 +L 2.6875 47.703125 +L 2.6875 54.6875 +L 9.28125 54.6875 +L 9.28125 70.21875 +z +" id="DejaVuSans-116"/> + <path d="M 9.421875 75.984375 +L 18.40625 75.984375 +L 18.40625 0 +L 9.421875 0 +z +" id="DejaVuSans-108"/> + <path d="M 11.71875 12.40625 +L 22.015625 12.40625 +L 22.015625 4 +L 14.015625 -11.625 +L 7.71875 -11.625 +L 11.71875 4 +z +" id="DejaVuSans-44"/> + <path d="M 8.5 21.578125 +L 8.5 54.6875 +L 17.484375 54.6875 +L 17.484375 21.921875 +Q 17.484375 14.15625 20.5 10.265625 +Q 23.53125 6.390625 29.59375 6.390625 +Q 36.859375 6.390625 41.078125 11.03125 +Q 45.3125 15.671875 45.3125 23.6875 +L 45.3125 54.6875 +L 54.296875 54.6875 +L 54.296875 0 +L 45.3125 0 +L 45.3125 8.40625 +Q 42.046875 3.421875 37.71875 1 +Q 33.40625 -1.421875 27.6875 -1.421875 +Q 18.265625 -1.421875 13.375 4.4375 +Q 8.5 10.296875 8.5 21.578125 +z +M 31.109375 56 +z +" id="DejaVuSans-117"/> + <path d="M 54.890625 33.015625 +L 54.890625 0 +L 45.90625 0 +L 45.90625 32.71875 +Q 45.90625 40.484375 42.875 44.328125 +Q 39.84375 48.1875 33.796875 48.1875 +Q 26.515625 48.1875 22.3125 43.546875 +Q 18.109375 38.921875 18.109375 30.90625 +L 18.109375 0 +L 9.078125 0 +L 9.078125 75.984375 +L 18.109375 75.984375 +L 18.109375 46.1875 +Q 21.34375 51.125 25.703125 53.5625 +Q 30.078125 56 35.796875 56 +Q 45.21875 56 50.046875 50.171875 +Q 54.890625 44.34375 54.890625 33.015625 +z +" id="DejaVuSans-104"/> + </defs> + <g 
style="fill:#262626;" transform="translate(88.626406 16.318125)scale(0.12 -0.12)"> + <use xlink:href="#DejaVuSans-80"/> + <use x="60.255859" xlink:href="#DejaVuSans-101"/> + <use x="121.779297" xlink:href="#DejaVuSans-114"/> + <use x="162.892578" xlink:href="#DejaVuSans-102"/> + <use x="198.097656" xlink:href="#DejaVuSans-111"/> + <use x="259.279297" xlink:href="#DejaVuSans-114"/> + <use x="300.376953" xlink:href="#DejaVuSans-109"/> + <use x="397.789062" xlink:href="#DejaVuSans-97"/> + <use x="459.068359" xlink:href="#DejaVuSans-110"/> + <use x="522.447266" xlink:href="#DejaVuSans-99"/> + <use x="577.427734" xlink:href="#DejaVuSans-101"/> + <use x="638.951172" xlink:href="#DejaVuSans-32"/> + <use x="670.738281" xlink:href="#DejaVuSans-111"/> + <use x="731.919922" xlink:href="#DejaVuSans-110"/> + <use x="795.298828" xlink:href="#DejaVuSans-32"/> + <use x="827.085938" xlink:href="#DejaVuSans-65"/> + <use x="895.416016" xlink:href="#DejaVuSans-87"/> + <use x="994.292969" xlink:href="#DejaVuSans-83"/> + <use x="1057.769531" xlink:href="#DejaVuSans-32"/> + <use x="1089.556641" xlink:href="#DejaVuSans-99"/> + <use x="1144.537109" xlink:href="#DejaVuSans-53"/> + <use x="1208.160156" xlink:href="#DejaVuSans-46"/> + <use x="1239.947266" xlink:href="#DejaVuSans-109"/> + <use x="1337.359375" xlink:href="#DejaVuSans-101"/> + <use x="1398.882812" xlink:href="#DejaVuSans-116"/> + <use x="1438.091797" xlink:href="#DejaVuSans-97"/> + <use x="1499.371094" xlink:href="#DejaVuSans-108"/> + <use x="1527.154297" xlink:href="#DejaVuSans-44"/> + <use x="1558.941406" xlink:href="#DejaVuSans-32"/> + <use x="1590.728516" xlink:href="#DejaVuSans-49"/> + <use x="1654.351562" xlink:href="#DejaVuSans-54"/> + <use x="1717.974609" xlink:href="#DejaVuSans-32"/> + <use x="1749.761719" xlink:href="#DejaVuSans-75"/> + <use x="1815.337891" xlink:href="#DejaVuSans-105"/> + <use x="1843.121094" xlink:href="#DejaVuSans-66"/> + <use x="1911.724609" xlink:href="#DejaVuSans-32"/> + <use x="1943.511719" 
xlink:href="#DejaVuSans-105"/> + <use x="1971.294922" xlink:href="#DejaVuSans-110"/> + <use x="2034.673828" xlink:href="#DejaVuSans-112"/> + <use x="2098.150391" xlink:href="#DejaVuSans-117"/> + <use x="2161.529297" xlink:href="#DejaVuSans-116"/> + <use x="2200.738281" xlink:href="#DejaVuSans-44"/> + <use x="2232.525391" xlink:href="#DejaVuSans-32"/> + <use x="2264.3125" xlink:href="#DejaVuSans-49"/> + <use x="2327.935547" xlink:href="#DejaVuSans-32"/> + <use x="2359.722656" xlink:href="#DejaVuSans-116"/> + <use x="2398.931641" xlink:href="#DejaVuSans-104"/> + <use x="2462.310547" xlink:href="#DejaVuSans-114"/> + <use x="2503.392578" xlink:href="#DejaVuSans-101"/> + <use x="2564.916016" xlink:href="#DejaVuSans-97"/> + <use x="2626.195312" xlink:href="#DejaVuSans-100"/> + </g> + </g> + </g> + </g> + <defs> + <clipPath id="p6091bd3d0f"> + <rect height="266.112" width="357.12" x="71.443906" y="22.318125"/> + </clipPath> + </defs> +</svg> diff --git a/3rdparty/BLAKE3/reference_impl/Cargo.toml b/3rdparty/BLAKE3/reference_impl/Cargo.toml new file mode 100644 index 000000000..8c81e5ad9 --- /dev/null +++ b/3rdparty/BLAKE3/reference_impl/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "reference_impl" +version = "0.0.0" +edition = "2018" + +[lib] +name = "reference_impl" +path = "reference_impl.rs" diff --git a/3rdparty/BLAKE3/reference_impl/README.md b/3rdparty/BLAKE3/reference_impl/README.md new file mode 100644 index 000000000..941fafd72 --- /dev/null +++ b/3rdparty/BLAKE3/reference_impl/README.md @@ -0,0 +1,9 @@ +This is the reference implementation of BLAKE3. It is used for testing and +as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 +spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) +discusses this implementation. You can render docs for this implementation +by running `cargo doc --open` in this directory. + +This implementation is a single file +([`reference_impl.rs`](reference_impl.rs)) with no dependencies. 
It is +not optimized for performance. diff --git a/3rdparty/BLAKE3/reference_impl/reference_impl.rs b/3rdparty/BLAKE3/reference_impl/reference_impl.rs new file mode 100644 index 000000000..248834319 --- /dev/null +++ b/3rdparty/BLAKE3/reference_impl/reference_impl.rs @@ -0,0 +1,383 @@ +//! This is the reference implementation of BLAKE3. It is used for testing and +//! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 +//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) +//! discusses this implementation. You can render docs for this implementation +//! by running `cargo doc --open` in this directory. +//! +//! # Example +//! +//! ``` +//! let mut hasher = reference_impl::Hasher::new(); +//! hasher.update(b"abc"); +//! hasher.update(b"def"); +//! let mut hash = [0; 32]; +//! hasher.finalize(&mut hash); +//! let mut extended_hash = [0; 500]; +//! hasher.finalize(&mut extended_hash); +//! assert_eq!(hash, extended_hash[..32]); +//! ``` + +use core::cmp::min; +use core::convert::TryInto; + +const OUT_LEN: usize = 32; +const KEY_LEN: usize = 32; +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +const KEYED_HASH: u32 = 1 << 4; +const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: [u32; 8] = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. 
+fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(my); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +fn round(state: &mut [u32; 16], m: &[u32; 16]) { + // Mix the columns. + g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. + g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +fn permute(m: &mut [u32; 16]) { + let mut permuted = [0; 16]; + for i in 0..16 { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + *m = permuted; +} + +fn compress( + chaining_value: &[u32; 8], + block_words: &[u32; 16], + counter: u64, + block_len: u32, + flags: u32, +) -> [u32; 16] { + let mut state = [ + chaining_value[0], + chaining_value[1], + chaining_value[2], + chaining_value[3], + chaining_value[4], + chaining_value[5], + chaining_value[6], + chaining_value[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter as u32, + (counter >> 32) as u32, + block_len, + flags, + ]; + let mut block = *block_words; + + round(&mut state, &block); // round 1 + permute(&mut block); + round(&mut state, &block); // round 2 + permute(&mut block); + round(&mut state, &block); // round 3 + permute(&mut block); + round(&mut state, &block); // round 4 + permute(&mut block); + round(&mut state, &block); // round 5 + permute(&mut block); + round(&mut state, &block); // round 6 + permute(&mut block); + round(&mut state, &block); // round 7 + + for i in 0..8 { + state[i] ^= 
state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + state +} + +fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] { + compression_output[0..8].try_into().unwrap() +} + +fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) { + for (bytes_block, word) in bytes.chunks_exact(4).zip(words.iter_mut()) { + *word = u32::from_le_bytes(bytes_block.try_into().unwrap()); + } +} + +// Each chunk or parent node can produce either an 8-word chaining value or, by +// setting the ROOT flag, any number of final output bytes. The Output struct +// captures the state just prior to choosing between those two possibilities. +struct Output { + input_chaining_value: [u32; 8], + block_words: [u32; 16], + counter: u64, + block_len: u32, + flags: u32, +} + +impl Output { + fn chaining_value(&self) -> [u32; 8] { + first_8_words(compress( + &self.input_chaining_value, + &self.block_words, + self.counter, + self.block_len, + self.flags, + )) + } + + fn root_output_bytes(&self, out_slice: &mut [u8]) { + let mut output_block_counter = 0; + for out_block in out_slice.chunks_mut(2 * OUT_LEN) { + let words = compress( + &self.input_chaining_value, + &self.block_words, + output_block_counter, + self.block_len, + self.flags | ROOT, + ); + // The output length might not be a multiple of 4. 
+ for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) { + out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]); + } + output_block_counter += 1; + } + } +} + +struct ChunkState { + chaining_value: [u32; 8], + chunk_counter: u64, + block: [u8; BLOCK_LEN], + block_len: u8, + blocks_compressed: u8, + flags: u32, +} + +impl ChunkState { + fn new(key: [u32; 8], chunk_counter: u64, flags: u32) -> Self { + Self { + chaining_value: key, + chunk_counter, + block: [0; BLOCK_LEN], + block_len: 0, + blocks_compressed: 0, + flags, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize + } + + fn start_flag(&self) -> u32 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the block buffer is full, compress it and clear it. More + // input is coming, so this compression is not CHUNK_END. + if self.block_len as usize == BLOCK_LEN { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + self.chaining_value = first_8_words(compress( + &self.chaining_value, + &block_words, + self.chunk_counter, + BLOCK_LEN as u32, + self.flags | self.start_flag(), + )); + self.blocks_compressed += 1; + self.block = [0; BLOCK_LEN]; + self.block_len = 0; + } + + // Copy input bytes into the block buffer. 
+ let want = BLOCK_LEN - self.block_len as usize; + let take = min(want, input.len()); + self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]); + self.block_len += take as u8; + input = &input[take..]; + } + } + + fn output(&self) -> Output { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + Output { + input_chaining_value: self.chaining_value, + block_words, + block_len: self.block_len as u32, + counter: self.chunk_counter, + flags: self.flags | self.start_flag() | CHUNK_END, + } + } +} + +fn parent_output( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key: [u32; 8], + flags: u32, +) -> Output { + let mut block_words = [0; 16]; + block_words[..8].copy_from_slice(&left_child_cv); + block_words[8..].copy_from_slice(&right_child_cv); + Output { + input_chaining_value: key, + block_words, + counter: 0, // Always 0 for parent nodes. + block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes. + flags: PARENT | flags, + } +} + +fn parent_cv( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key: [u32; 8], + flags: u32, +) -> [u32; 8] { + parent_output(left_child_cv, right_child_cv, key, flags).chaining_value() +} + +/// An incremental hasher that can accept any number of writes. +pub struct Hasher { + chunk_state: ChunkState, + key: [u32; 8], + cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values: + cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64 + flags: u32, +} + +impl Hasher { + fn new_internal(key: [u32; 8], flags: u32) -> Self { + Self { + chunk_state: ChunkState::new(key, 0, flags), + key, + cv_stack: [[0; 8]; 54], + cv_stack_len: 0, + flags, + } + } + + /// Construct a new `Hasher` for the regular hash function. + pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. 
+ pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let mut key_words = [0; 8]; + words_from_little_endian_bytes(key, &mut key_words); + Self::new_internal(key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. The context + /// string should be hardcoded, globally unique, and application-specific. + pub fn new_derive_key(context: &str) -> Self { + let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); + context_hasher.update(context.as_bytes()); + let mut context_key = [0; KEY_LEN]; + context_hasher.finalize(&mut context_key); + let mut context_key_words = [0; 8]; + words_from_little_endian_bytes(&context_key, &mut context_key_words); + Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) + } + + fn push_stack(&mut self, cv: [u32; 8]) { + self.cv_stack[self.cv_stack_len as usize] = cv; + self.cv_stack_len += 1; + } + + fn pop_stack(&mut self) -> [u32; 8] { + self.cv_stack_len -= 1; + self.cv_stack[self.cv_stack_len as usize] + } + + // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. + fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { + // This chunk might complete some subtrees. For each completed subtree, + // its left child will be the current top entry in the CV stack, and + // its right child will be the current value of `new_cv`. Pop each left + // child off the stack, merge it with `new_cv`, and overwrite `new_cv` + // with the result. After all these merges, push the final value of + // `new_cv` onto the stack. The number of completed subtrees is given + // by the number of trailing 0-bits in the new total number of chunks. + while total_chunks & 1 == 0 { + new_cv = parent_cv(self.pop_stack(), new_cv, self.key, self.flags); + total_chunks >>= 1; + } + self.push_stack(new_cv); + } + + /// Add input to the hash state. This can be called any number of times. 
+ pub fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the current chunk is complete, finalize it and reset the + // chunk state. More input is coming, so this chunk is not ROOT. + if self.chunk_state.len() == CHUNK_LEN { + let chunk_cv = self.chunk_state.output().chaining_value(); + let total_chunks = self.chunk_state.chunk_counter + 1; + self.add_chunk_chaining_value(chunk_cv, total_chunks); + self.chunk_state = ChunkState::new(self.key, total_chunks, self.flags); + } + + // Compress input bytes into the current chunk state. + let want = CHUNK_LEN - self.chunk_state.len(); + let take = min(want, input.len()); + self.chunk_state.update(&input[..take]); + input = &input[take..]; + } + } + + /// Finalize the hash and write any number of output bytes. + pub fn finalize(&self, out_slice: &mut [u8]) { + // Starting with the Output from the current chunk, compute all the + // parent chaining values along the right edge of the tree, until we + // have the root Output. + let mut output = self.chunk_state.output(); + let mut parent_nodes_remaining = self.cv_stack_len as usize; + while parent_nodes_remaining > 0 { + parent_nodes_remaining -= 1; + output = parent_output( + self.cv_stack[parent_nodes_remaining], + output.chaining_value(), + self.key, + self.flags, + ); + } + output.root_output_bytes(out_slice); + } +} diff --git a/3rdparty/BLAKE3/src/ffi_avx2.rs b/3rdparty/BLAKE3/src/ffi_avx2.rs new file mode 100644 index 000000000..d805e868e --- /dev/null +++ b/3rdparty/BLAKE3/src/ffi_avx2.rs @@ -0,0 +1,63 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Note that there is no AVX2 implementation of compress_in_place or +// compress_xof. + +// Unsafe because this may only be called on platforms supporting AVX2. 
+pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/3rdparty/BLAKE3/src/ffi_avx512.rs b/3rdparty/BLAKE3/src/ffi_avx512.rs new file mode 100644 index 000000000..c1b9f649b --- /dev/null +++ b/3rdparty/BLAKE3/src/ffi_avx512.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting AVX-512. 
+pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_avx512( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx512( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_avx512( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_avx512( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::avx512_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::avx512_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git 
a/3rdparty/BLAKE3/src/ffi_neon.rs b/3rdparty/BLAKE3/src/ffi_neon.rs new file mode 100644 index 000000000..889974277 --- /dev/null +++ b/3rdparty/BLAKE3/src/ffi_neon.rs @@ -0,0 +1,82 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting NEON. +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_neon( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +// blake3_neon.c normally depends on blake3_portable.c, because the NEON +// implementation only provides 4x compression, and it relies on the portable +// implementation for 1x compression. However, we expose the portable Rust +// implementation here instead, to avoid linking in unnecessary code. 
+#[no_mangle] +pub extern "C" fn blake3_compress_in_place_portable( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, +) { + unsafe { + crate::portable::compress_in_place( + &mut *(cv as *mut [u32; 8]), + &*(block as *const [u8; 64]), + block_len, + counter, + flags, + ) + } +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_neon( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + // This entire file is gated on feature="neon", so NEON support is + // assumed here. + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/3rdparty/BLAKE3/src/ffi_sse2.rs b/3rdparty/BLAKE3/src/ffi_sse2.rs new file mode 100644 index 000000000..c49a229ad --- /dev/null +++ b/3rdparty/BLAKE3/src/ffi_sse2.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE2. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_sse2(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting SSE2. +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse2( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting SSE2. 
+pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse2( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse2( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/3rdparty/BLAKE3/src/ffi_sse41.rs b/3rdparty/BLAKE3/src/ffi_sse41.rs new file mode 100644 index 000000000..0b64c90a0 --- /dev/null +++ b/3rdparty/BLAKE3/src/ffi_sse41.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE4.1. 
+pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse41( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. 
+ assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse41( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/3rdparty/BLAKE3/src/guts.rs b/3rdparty/BLAKE3/src/guts.rs new file mode 100644 index 000000000..88dcc86cd --- /dev/null +++ b/3rdparty/BLAKE3/src/guts.rs @@ -0,0 +1,95 @@ +// This module is for incremental use cases like the `bao` crate, which need to +// get their hands on internal chunk and parent chaining values. The vast +// majority of users should ignore this and use the publicly documented +// interface instead. + +#[derive(Clone, Debug)] +pub struct ChunkState(crate::ChunkState); + +impl ChunkState { + // Currently this type only supports the regular hash mode. If an + // incremental user needs keyed_hash or derive_key, we can add that. 
+ pub fn new(chunk_counter: u64) -> Self { + Self(crate::ChunkState::new( + crate::IV, + chunk_counter, + 0, + crate::platform::Platform::detect(), + )) + } + + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn update(&mut self, input: &[u8]) -> &mut Self { + self.0.update(input); + self + } + + pub fn finalize(&self, is_root: bool) -> crate::Hash { + let output = self.0.output(); + if is_root { + output.root_hash() + } else { + output.chaining_value().into() + } + } +} + +// As above, this currently assumes the regular hash mode. If an incremental +// user needs keyed_hash or derive_key, we can add that. +pub fn parent_cv( + left_child: &crate::Hash, + right_child: &crate::Hash, + is_root: bool, +) -> crate::Hash { + let output = crate::parent_node_output( + left_child.as_bytes(), + right_child.as_bytes(), + crate::IV, + 0, + crate::platform::Platform::detect(), + ); + if is_root { + output.root_hash() + } else { + output.chaining_value().into() + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_chunk() { + assert_eq!( + crate::hash(b"foo"), + ChunkState::new(0).update(b"foo").finalize(true) + ); + } + + #[test] + fn test_parents() { + let mut hasher = crate::Hasher::new(); + let mut buf = [0; crate::CHUNK_LEN]; + + buf[0] = 'a' as u8; + hasher.update(&buf); + let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); + + buf[0] = 'b' as u8; + hasher.update(&buf); + let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); + + hasher.update(b"c"); + let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); + + let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); + let root = parent_cv(&parent, &chunk2_cv, true); + assert_eq!(hasher.finalize(), root); + } +} diff --git a/3rdparty/BLAKE3/src/join.rs b/3rdparty/BLAKE3/src/join.rs new file mode 100644 index 000000000..60932db1c --- /dev/null +++ b/3rdparty/BLAKE3/src/join.rs @@ -0,0 +1,120 @@ +//! 
The multi-threading abstractions used by [`Hasher::update_with_join`]. +//! +//! Different implementations of the `Join` trait determine whether +//! [`Hasher::update_with_join`] performs multi-threading on sufficiently large +//! inputs. The `SerialJoin` implementation is single-threaded, and the +//! `RayonJoin` implementation (gated by the `rayon` feature) is +//! multi-threaded. Interfaces other than [`Hasher::update_with_join`], like +//! [`hash`] and [`Hasher::update`], always use `SerialJoin` internally. +//! +//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and +//! `RayonJoin` is the only non-trivial implementation provided. The only +//! difference between the function signature in the `Join` trait and the +//! underlying one in Rayon, is that the trait method includes two length +//! parameters. This gives an implementation the option of e.g. setting a +//! subtree size threshold below which it keeps splits on the same thread. +//! However, neither of the two provided implementations currently makes use of +//! those parameters. Note that in Rayon, the very first `join` call is more +//! expensive than subsequent calls, because it moves work from the calling +//! thread into the thread pool. That makes a coarse-grained input length +//! threshold in the caller more effective than a fine-grained subtree size +//! threshold after the implementation has already started recursing. +//! +//! # Example +//! +//! ``` +//! // Hash a large input using multi-threading. Note that multi-threading +//! // comes with some overhead, and it can actually hurt performance for small +//! // inputs. The meaning of "small" varies, however, depending on the +//! // platform and the number of threads. (On x86_64, the cutoff tends to be +//! // around 128 KiB.) You should benchmark your own use case to see whether +//! // multi-threading helps. +//! # #[cfg(feature = "rayon")] +//! # { +//! # fn some_large_input() -> &'static [u8] { b"foo" } +//! 
let input: &[u8] = some_large_input(); +//! let mut hasher = blake3::Hasher::new(); +//! hasher.update_with_join::<blake3::join::RayonJoin>(input); +//! let hash = hasher.finalize(); +//! # } +//! ``` +//! +//! [`Hasher::update_with_join`]: ../struct.Hasher.html#method.update_with_join +//! [`Hasher::update`]: ../struct.Hasher.html#method.update +//! [`hash`]: ../fn.hash.html +//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html + +/// The trait that abstracts over single-threaded and multi-threaded recursion. +/// +/// See the [`join` module docs](index.html) for more details. +pub trait Join { + fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send; +} + +/// The trivial, serial implementation of `Join`. The left and right sides are +/// executed one after the other, on the calling thread. The standalone hashing +/// functions and the `Hasher::update` method use this implementation +/// internally. +/// +/// See the [`join` module docs](index.html) for more details. +pub enum SerialJoin {} + +impl Join for SerialJoin { + #[inline] + fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send, + { + (oper_a(), oper_b()) + } +} + +/// The Rayon-based implementation of `Join`. The left and right sides are +/// executed on the Rayon thread pool, potentially in parallel. This +/// implementation is gated by the `rayon` feature, which is off by default. +/// +/// See the [`join` module docs](index.html) for more details. 
+#[cfg(feature = "rayon")] +pub enum RayonJoin {} + +#[cfg(feature = "rayon")] +impl Join for RayonJoin { + #[inline] + fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send, + { + rayon::join(oper_a, oper_b) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_serial_join() { + let oper_a = || 1 + 1; + let oper_b = || 2 + 2; + assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b, 3, 4)); + } + + #[test] + #[cfg(feature = "rayon")] + fn test_rayon_join() { + let oper_a = || 1 + 1; + let oper_b = || 2 + 2; + assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b, 3, 4)); + } +} diff --git a/3rdparty/BLAKE3/src/lib.rs b/3rdparty/BLAKE3/src/lib.rs new file mode 100644 index 000000000..bf66b6dae --- /dev/null +++ b/3rdparty/BLAKE3/src/lib.rs @@ -0,0 +1,1359 @@ +//! The official Rust implementation of the [BLAKE3] cryptographic hash +//! function. +//! +//! # Examples +//! +//! ``` +//! # fn main() -> Result<(), Box<dyn std::error::Error>> { +//! // Hash an input all at once. +//! let hash1 = blake3::hash(b"foobarbaz"); +//! +//! // Hash an input incrementally. +//! let mut hasher = blake3::Hasher::new(); +//! hasher.update(b"foo"); +//! hasher.update(b"bar"); +//! hasher.update(b"baz"); +//! let hash2 = hasher.finalize(); +//! assert_eq!(hash1, hash2); +//! +//! // Extended output. OutputReader also implements Read and Seek. +//! # #[cfg(feature = "std")] { +//! let mut output = [0; 1000]; +//! let mut output_reader = hasher.finalize_xof(); +//! output_reader.fill(&mut output); +//! assert_eq!(&output[..32], hash1.as_bytes()); +//! # } +//! +//! // Print a hash as hex. +//! println!("{}", hash1.to_hex()); +//! # Ok(()) +//! # } +//! ``` +//! +//! # Cargo Features +//! +//! The `rayon` feature provides [Rayon]-based multi-threading, in particular +//! the [`join::RayonJoin`] type for use with [`Hasher::update_with_join`]. It +//! 
is disabled by default, but enabled for [docs.rs]. +//! +//! The `neon` feature enables ARM NEON support. Currently there is no runtime +//! CPU feature detection for NEON, so you must only enable this feature for +//! targets that are known to have NEON support. In particular, some ARMv7 +//! targets support NEON, and some don't. +//! +//! The `std` feature (enabled by default) is required for implementations of +//! the [`Write`] and [`Seek`] traits, and also for runtime CPU feature +//! detection. If this feature is disabled, the only way to use the SIMD +//! implementations in this crate is to enable the corresponding instruction +//! sets statically for the entire build, with e.g. `RUSTFLAGS="-C +//! target-cpu=native"`. The resulting binary will not be portable to other +//! machines. +//! +//! [BLAKE3]: https://blake3.io +//! [Rayon]: https://github.com/rayon-rs/rayon +//! [`join::RayonJoin`]: join/enum.RayonJoin.html +//! [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join +//! [docs.rs]: https://docs.rs/ +//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html +//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html + +#![cfg_attr(not(feature = "std"), no_std)] + +#[cfg(test)] +mod test; + +// The guts module is for incremental use cases like the `bao` crate that need +// to explicitly compute chunk and parent chaining values. It is semi-stable +// and likely to keep working, but largely undocumented and not intended for +// widespread use. +#[doc(hidden)] +pub mod guts; + +// The platform module is pub for benchmarks only. It is not stable. +#[doc(hidden)] +pub mod platform; + +// Platform-specific implementations of the compression function. These +// BLAKE3-specific cfg flags are set in build.rs. 
+#[cfg(blake3_avx2_rust)] +#[path = "rust_avx2.rs"] +mod avx2; +#[cfg(blake3_avx2_ffi)] +#[path = "ffi_avx2.rs"] +mod avx2; +#[cfg(blake3_avx512_ffi)] +#[path = "ffi_avx512.rs"] +mod avx512; +#[cfg(feature = "neon")] +#[path = "ffi_neon.rs"] +mod neon; +mod portable; +#[cfg(blake3_sse2_rust)] +#[path = "rust_sse2.rs"] +mod sse2; +#[cfg(blake3_sse2_ffi)] +#[path = "ffi_sse2.rs"] +mod sse2; +#[cfg(blake3_sse41_rust)] +#[path = "rust_sse41.rs"] +mod sse41; +#[cfg(blake3_sse41_ffi)] +#[path = "ffi_sse41.rs"] +mod sse41; + +pub mod traits; + +pub mod join; + +use arrayref::{array_mut_ref, array_ref}; +use arrayvec::{ArrayString, ArrayVec}; +use core::cmp; +use core::fmt; +use join::{Join, SerialJoin}; +use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; + +/// The number of bytes in a [`Hash`](struct.Hash.html), 32. +pub const OUT_LEN: usize = 32; + +/// The number of bytes in a key, 32. +pub const KEY_LEN: usize = 32; + +// These constants are pub for incremental use cases like `bao`, as well as +// tests and benchmarks. Most callers should not need them. +#[doc(hidden)] +pub const BLOCK_LEN: usize = 64; +#[doc(hidden)] +pub const CHUNK_LEN: usize = 1024; +#[doc(hidden)] +pub const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 + +// While iterating the compression function within a chunk, the CV is +// represented as words, to avoid doing two extra endianness conversions for +// each compression in the portable implementation. But the hash_many interface +// needs to hash both input bytes and parent nodes, so it's better for its +// output CVs to be represented as bytes.
+type CVWords = [u32; 8]; +type CVBytes = [u8; 32]; // little-endian + +const IV: &CVWords = &[ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_SCHEDULE: [[usize; 16]; 7] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], + [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], + [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], + [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], + [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], + [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], +]; + +// These are the internal flags that we use to domain separate root/non-root, +// chunk/parent, and chunk beginning/middle/end. These get set at the high end +// of the block flags word in the compression function, so their values start +// high and go down. +const CHUNK_START: u8 = 1 << 0; +const CHUNK_END: u8 = 1 << 1; +const PARENT: u8 = 1 << 2; +const ROOT: u8 = 1 << 3; +const KEYED_HASH: u8 = 1 << 4; +const DERIVE_KEY_CONTEXT: u8 = 1 << 5; +const DERIVE_KEY_MATERIAL: u8 = 1 << 6; + +#[inline] +fn counter_low(counter: u64) -> u32 { + counter as u32 +} + +#[inline] +fn counter_high(counter: u64) -> u32 { + (counter >> 32) as u32 +} + +/// An output of the default size, 32 bytes, which provides constant-time +/// equality checking. +/// +/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides an +/// explicit [`as_bytes`] method returning `&[u8; 32]`. However, byte arrays +/// and slices don't provide constant-time equality checking, which is often a +/// security requirement in software that handles private data. `Hash` doesn't +/// implement [`Deref`] or [`AsRef`], to avoid situations where a type +/// conversion happens implicitly and the constant-time property is +/// accidentally lost. +/// +/// `Hash` provides the [`to_hex`] method for converting to hexadecimal. 
It +/// doesn't directly support converting from hexadecimal, but here's an example +/// of doing that with the [`hex`] crate: +/// +/// ``` +/// # fn main() -> Result<(), Box<dyn std::error::Error>> { +/// use std::convert::TryInto; +/// +/// let hash_hex = "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"; +/// let hash_bytes = hex::decode(hash_hex)?; +/// let hash_array: [u8; blake3::OUT_LEN] = hash_bytes[..].try_into()?; +/// let hash: blake3::Hash = hash_array.into(); +/// # Ok(()) +/// # } +/// ``` +/// +/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html +/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html +/// [`as_bytes`]: #method.as_bytes +/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html +/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html +/// [`to_hex`]: #method.to_hex +/// [`hex`]: https://crates.io/crates/hex +#[derive(Clone, Copy, Hash)] +pub struct Hash([u8; OUT_LEN]); + +impl Hash { + /// The bytes of the `Hash`. Note that byte arrays don't provide + /// constant-time equality checking, so if you need to compare hashes, + /// prefer the `Hash` type. + #[inline] + pub fn as_bytes(&self) -> &[u8; OUT_LEN] { + &self.0 + } + + /// The hexadecimal encoding of the `Hash`. The returned [`ArrayString`] is + /// a fixed size and doesn't allocate memory on the heap. Note that + /// [`ArrayString`] doesn't provide constant-time equality checking, so if + /// you need to compare hashes, prefer the `Hash` type. 
+ /// + /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html + pub fn to_hex(&self) -> ArrayString<[u8; 2 * OUT_LEN]> { + let mut s = ArrayString::new(); + let table = b"0123456789abcdef"; + for &b in self.0.iter() { + s.push(table[(b >> 4) as usize] as char); + s.push(table[(b & 0xf) as usize] as char); + } + s + } +} + +impl From<[u8; OUT_LEN]> for Hash { + #[inline] + fn from(bytes: [u8; OUT_LEN]) -> Self { + Self(bytes) + } +} + +impl From<Hash> for [u8; OUT_LEN] { + #[inline] + fn from(hash: Hash) -> Self { + hash.0 + } +} + +/// This implementation is constant-time. +impl PartialEq for Hash { + #[inline] + fn eq(&self, other: &Hash) -> bool { + constant_time_eq::constant_time_eq_32(&self.0, &other.0) + } +} + +/// This implementation is constant-time. +impl PartialEq<[u8; OUT_LEN]> for Hash { + #[inline] + fn eq(&self, other: &[u8; OUT_LEN]) -> bool { + constant_time_eq::constant_time_eq_32(&self.0, other) + } +} + +impl Eq for Hash {} + +impl fmt::Debug for Hash { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // Formatting field as `&str` to reduce code size since the `Debug` + // dynamic dispatch table for `&str` is likely needed elsewhere already, + // but that for `ArrayString<[u8; 64]>` is not. + let hex = self.to_hex(); + let hex: &str = hex.as_str(); + + f.debug_tuple("Hash").field(&hex).finish() + } +} + +// Each chunk or parent node can produce either a 32-byte chaining value or, by +// setting the ROOT flag, any number of final output bytes. The Output struct +// captures the state just prior to choosing between those two possibilities. 
+#[derive(Clone)] +struct Output { + input_chaining_value: CVWords, + block: [u8; 64], + block_len: u8, + counter: u64, + flags: u8, + platform: Platform, +} + +impl Output { + fn chaining_value(&self) -> CVBytes { + let mut cv = self.input_chaining_value; + self.platform.compress_in_place( + &mut cv, + &self.block, + self.block_len, + self.counter, + self.flags, + ); + platform::le_bytes_from_words_32(&cv) + } + + fn root_hash(&self) -> Hash { + debug_assert_eq!(self.counter, 0); + let mut cv = self.input_chaining_value; + self.platform + .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); + Hash(platform::le_bytes_from_words_32(&cv)) + } + + fn root_output_block(&self) -> [u8; 2 * OUT_LEN] { + self.platform.compress_xof( + &self.input_chaining_value, + &self.block, + self.block_len, + self.counter, + self.flags | ROOT, + ) + } +} + +#[derive(Clone)] +struct ChunkState { + cv: CVWords, + chunk_counter: u64, + buf: [u8; BLOCK_LEN], + buf_len: u8, + blocks_compressed: u8, + flags: u8, + platform: Platform, +} + +impl ChunkState { + fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { + Self { + cv: *key, + chunk_counter, + buf: [0; BLOCK_LEN], + buf_len: 0, + blocks_compressed: 0, + flags, + platform, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize + } + + fn fill_buf(&mut self, input: &mut &[u8]) { + let want = BLOCK_LEN - self.buf_len as usize; + let take = cmp::min(want, input.len()); + self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]); + self.buf_len += take as u8; + *input = &input[take..]; + } + + fn start_flag(&self) -> u8 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + // Try to avoid buffering as much as possible, by compressing directly from + // the input slice when full blocks are available. 
+ fn update(&mut self, mut input: &[u8]) -> &mut Self { + if self.buf_len > 0 { + self.fill_buf(&mut input); + if !input.is_empty() { + debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); + let block_flags = self.flags | self.start_flag(); // borrowck + self.platform.compress_in_place( + &mut self.cv, + &self.buf, + BLOCK_LEN as u8, + self.chunk_counter, + block_flags, + ); + self.buf_len = 0; + self.buf = [0; BLOCK_LEN]; + self.blocks_compressed += 1; + } + } + + while input.len() > BLOCK_LEN { + debug_assert_eq!(self.buf_len, 0); + let block_flags = self.flags | self.start_flag(); // borrowck + self.platform.compress_in_place( + &mut self.cv, + array_ref!(input, 0, BLOCK_LEN), + BLOCK_LEN as u8, + self.chunk_counter, + block_flags, + ); + self.blocks_compressed += 1; + input = &input[BLOCK_LEN..]; + } + + self.fill_buf(&mut input); + debug_assert!(input.is_empty()); + debug_assert!(self.len() <= CHUNK_LEN); + self + } + + fn output(&self) -> Output { + let block_flags = self.flags | self.start_flag() | CHUNK_END; + Output { + input_chaining_value: self.cv, + block: self.buf, + block_len: self.buf_len, + counter: self.chunk_counter, + flags: block_flags, + platform: self.platform, + } + } +} + +// Don't derive(Debug), because the state may be secret. +impl fmt::Debug for ChunkState { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("ChunkState") + .field("len", &self.len()) + .field("chunk_counter", &self.chunk_counter) + .field("flags", &self.flags) + .field("platform", &self.platform) + .finish() + } +} + +// IMPLEMENTATION NOTE +// =================== +// The recursive function compress_subtree_wide(), implemented below, is the +// basis of high-performance BLAKE3. We use it both for all-at-once hashing, +// and for the incremental input with Hasher (though we have to be careful with +// subtree boundaries in the incremental case). 
compress_subtree_wide() applies +// several optimizations at the same time: +// - Multi-threading with Rayon. +// - Parallel chunk hashing with SIMD. +// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing +// maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues +// to benefit from larger inputs, because more levels of the tree benefit can +// use full-width SIMD vectors for parent hashing. Without parallel parent +// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. + +// pub for benchmarks +#[doc(hidden)] +#[derive(Clone, Copy)] +pub enum IncrementCounter { + Yes, + No, +} + +impl IncrementCounter { + #[inline] + fn yes(&self) -> bool { + match self { + IncrementCounter::Yes => true, + IncrementCounter::No => false, + } + } +} + +// The largest power of two less than or equal to `n`, used for left_len() +// immediately below, and also directly in Hasher::update(). +fn largest_power_of_two_leq(n: usize) -> usize { + ((n / 2) + 1).next_power_of_two() +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +fn left_len(content_len: usize) -> usize { + debug_assert!(content_len > CHUNK_LEN); + // Subtract 1 to reserve at least one byte for the right side. + let full_chunks = (content_len - 1) / CHUNK_LEN; + largest_power_of_two_leq(full_chunks) * CHUNK_LEN +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. 
+fn compress_chunks_parallel( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + debug_assert!(!input.is_empty(), "empty chunks below the root"); + debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); + + let mut chunks_exact = input.chunks_exact(CHUNK_LEN); + let mut chunks_array = ArrayVec::<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]>::new(); + for chunk in &mut chunks_exact { + chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN)); + } + platform.hash_many( + &chunks_array, + key, + chunk_counter, + IncrementCounter::Yes, + flags, + CHUNK_START, + CHUNK_END, + out, + ); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + let chunks_so_far = chunks_array.len(); + if !chunks_exact.remainder().is_empty() { + let counter = chunk_counter + chunks_so_far as u64; + let mut chunk_state = ChunkState::new(key, counter, flags, platform); + chunk_state.update(chunks_exact.remainder()); + *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) = + chunk_state.output().chaining_value(); + chunks_so_far + 1 + } else { + chunks_so_far + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. 
+fn compress_parents_parallel( + child_chaining_values: &[u8], + key: &CVWords, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); + let num_children = child_chaining_values.len() / OUT_LEN; + debug_assert!(num_children >= 2, "not enough children"); + debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); + + let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN); + // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of + // the requirements of compress_subtree_wide(). + let mut parents_array = ArrayVec::<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE_OR_2]>::new(); + for parent in &mut parents_exact { + parents_array.push(array_ref!(parent, 0, BLOCK_LEN)); + } + platform.hash_many( + &parents_array, + key, + 0, // Parents always use counter 0. + IncrementCounter::No, + flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out, + ); + + // If there's an odd child left over, it becomes an output. + let parents_so_far = parents_array.len(); + if !parents_exact.remainder().is_empty() { + out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); + parents_so_far + 1 + } else { + parents_so_far + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. 
(If it did, it would use the wrong flags, and also we +// wouldn't be able to implement exendable ouput.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +fn compress_subtree_wide<J: Join>( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. This allows Rayon the option of multi-threading even the + // 2-chunk case, which can help performance on smaller platforms. + if input.len() <= platform.simd_degree() * CHUNK_LEN { + return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); + let (left, right) = input.split_at(left_len(input.len())); + let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; + let degree = if left.len() == CHUNK_LEN { + // The "simd_degree=1 and we're at the leaf nodes" case. + debug_assert_eq!(platform.simd_degree(), 1); + 1 + } else { + cmp::max(platform.simd_degree(), 2) + }; + let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); + + // Recurse! 
This uses multiple threads if the "rayon" feature is enabled. + let (left_n, right_n) = J::join( + || compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out), + || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out), + left.len(), + right.len(), + ); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + debug_assert_eq!(left_n, degree); + debug_assert!(right_n >= 1 && right_n <= left_n); + if left_n == 1 { + out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); + return 2; + } + + // Otherwise, do one layer of parent node compression. + let num_children = left_n + right_n; + compress_parents_parallel( + &cv_array[..num_children * OUT_LEN], + key, + flags, + platform, + out, + ) +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. 
+fn compress_subtree_to_parent_node<J: Join>( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, +) -> [u8; BLOCK_LEN] { + debug_assert!(input.len() > CHUNK_LEN); + let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; + let mut num_cvs = + compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array); + debug_assert!(num_cvs >= 2); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; + while num_cvs > 2 { + let cv_slice = &cv_array[..num_cvs * OUT_LEN]; + num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); + cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); + } + *array_ref!(cv_array, 0, 2 * OUT_LEN) +} + +// Hash a complete input all at once. Unlike compress_subtree_wide() and +// compress_subtree_to_parent_node(), this function handles the 1 chunk case. +// Note that this we use SerialJoin here, so this is always single-threaded. +fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { + let platform = Platform::detect(); + + // If the whole subtree is one chunk, hash it directly with a ChunkState. + if input.len() <= CHUNK_LEN { + return ChunkState::new(key, 0, flags, platform) + .update(input) + .output(); + } + + // Otherwise construct an Output object from the parent node returned by + // compress_subtree_to_parent_node(). + Output { + input_chaining_value: *key, + block: compress_subtree_to_parent_node::<SerialJoin>(input, key, 0, flags, platform), + block_len: BLOCK_LEN as u8, + counter: 0, + flags: flags | PARENT, + platform, + } +} + +/// The default hash function. +/// +/// For an incremental version that accepts multiple writes, see [`Hasher::update`]. +/// +/// This function is always single-threaded. 
For multi-threading support, see +/// [`Hasher::update_with_join`]. +/// +/// [`Hasher::update`]: struct.Hasher.html#method.update +/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join +pub fn hash(input: &[u8]) -> Hash { + hash_all_at_once(input, IV, 0).root_hash() +} + +/// The keyed hash function. +/// +/// This is suitable for use as a message authentication code, for +/// example to replace an HMAC instance. +/// In that use case, the constant-time equality checking provided by +/// [`Hash`](struct.Hash.html) is almost always a security requirement, and +/// callers need to be careful not to compare MACs as raw bytes. +/// +/// This function is always single-threaded. For multi-threading support, see +/// [`Hasher::update_with_join`]. +/// +/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join +pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { + let key_words = platform::words_from_le_bytes_32(key); + hash_all_at_once(input, &key_words, KEYED_HASH).root_hash() +} + +/// The key derivation function. +/// +/// Given cryptographic key material of any length and a context string of any +/// length, this function outputs a derived subkey of any length. **The context +/// string should be hardcoded, globally unique, and application-specific.** A +/// good default format for such strings is `"[application] [commit timestamp] +/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. +/// +/// Key derivation is important when you want to use the same key in multiple +/// algorithms or use cases. Using the same key with different cryptographic +/// algorithms is generally forbidden, and deriving a separate subkey for each +/// use case protects you from bad interactions. Derived keys also mitigate the +/// damage from one part of your application accidentally leaking its key. 
+/// +/// As a rare exception to that general rule, however, it is possible to use +/// `derive_key` itself with key material that you are already using with +/// another algorithm. You might need to do this if you're adding features to +/// an existing application, which does not yet use key derivation internally. +/// However, you still must not share key material with algorithms that forbid +/// key reuse entirely, like a one-time pad. +/// +/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be +/// used with passwords.** Instead, use a dedicated password hash like +/// [Argon2]. Password hashes are entirely different from generic hash +/// functions, with opposite design requirements. +/// +/// This function is always single-threaded. For multi-threading support, see +/// [`Hasher::update_with_join`]. +/// +/// [`Hasher::new_derive_key`]: struct.Hasher.html#method.new_derive_key +/// [`Hasher::finalize_xof`]: struct.Hasher.html#method.finalize_xof +/// [Argon2]: https://en.wikipedia.org/wiki/Argon2 +/// [`Hasher::update_with_join`]: struct.Hasher.html#method.update_with_join +pub fn derive_key(context: &str, key_material: &[u8], output: &mut [u8]) { + let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash(); + let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); + let inner_output = hash_all_at_once(key_material, &context_key_words, DERIVE_KEY_MATERIAL); + OutputReader::new(inner_output).fill(output); +} + +fn parent_node_output( + left_child: &CVBytes, + right_child: &CVBytes, + key: &CVWords, + flags: u8, + platform: Platform, +) -> Output { + let mut block = [0; BLOCK_LEN]; + block[..32].copy_from_slice(left_child); + block[32..].copy_from_slice(right_child); + Output { + input_chaining_value: *key, + block, + block_len: BLOCK_LEN as u8, + counter: 0, + flags: flags | PARENT, + platform, + } +} + +/// An incremental hash state that can accept any number of writes. 
+/// +/// In addition to its inherent methods, this type implements several commonly +/// used traits from the [`digest`](https://crates.io/crates/digest) and +/// [`crypto_mac`](https://crates.io/crates/crypto-mac) crates. +/// +/// **Performance note:** The [`update`] and [`update_with_join`] methods +/// perform poorly when the caller's input buffer is small. See their method +/// docs below. A 16 KiB buffer is large enough to leverage all currently +/// supported SIMD instruction sets. +/// +/// # Examples +/// +/// ``` +/// # fn main() -> Result<(), Box<dyn std::error::Error>> { +/// // Hash an input incrementally. +/// let mut hasher = blake3::Hasher::new(); +/// hasher.update(b"foo"); +/// hasher.update(b"bar"); +/// hasher.update(b"baz"); +/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz")); +/// +/// // Extended output. OutputReader also implements Read and Seek. +/// # #[cfg(feature = "std")] { +/// let mut output = [0; 1000]; +/// let mut output_reader = hasher.finalize_xof(); +/// output_reader.fill(&mut output); +/// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes()); +/// # } +/// # Ok(()) +/// # } +/// ``` +/// +/// [`update`]: #method.update +/// [`update_with_join`]: #method.update_with_join +#[derive(Clone)] +pub struct Hasher { + key: CVWords, + chunk_state: ChunkState, + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because + // we don't know whether more input is coming. This is different from how + // the reference implementation does things. + cv_stack: ArrayVec<[CVBytes; MAX_DEPTH + 1]>, +} + +impl Hasher { + fn new_internal(key: &CVWords, flags: u8) -> Self { + Self { + key: *key, + chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), + cv_stack: ArrayVec::new(), + } + } + + /// Construct a new `Hasher` for the regular hash function. 
+ pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. See + /// [`keyed_hash`]. + /// + /// [`keyed_hash`]: fn.keyed_hash.html + pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let key_words = platform::words_from_le_bytes_32(key); + Self::new_internal(&key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. See + /// [`derive_key`]. The context string should be hardcoded, globally + /// unique, and application-specific. + /// + /// [`derive_key`]: fn.derive_key.html + pub fn new_derive_key(context: &str) -> Self { + let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash(); + let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); + Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) + } + + /// Reset the `Hasher` to its initial state. + /// + /// This is functionally the same as overwriting the `Hasher` with a new + /// one, using the same key or context string if any. However, depending on + /// how much inlining the optimizer does, moving a `Hasher` might copy its + /// entire CV stack, most of which is useless uninitialized bytes. This + /// methods avoids that copy. + pub fn reset(&mut self) -> &mut Self { + self.chunk_state = ChunkState::new( + &self.key, + 0, + self.chunk_state.flags, + self.chunk_state.platform, + ); + self.cv_stack.clear(); + self + } + + // As described in push_cv() below, we do "lazy merging", delaying merges + // until right before the next CV is about to be added. This is different + // from the reference implementation. Another difference is that we aren't + // always merging 1 chunk at a time. Instead, each CV might represent any + // power-of-two number of chunks, as long as the smaller-above-larger stack + // order is maintained. 
Instead of the "count the trailing 0-bits" + // algorithm described in the spec, we use a "count the total number of + // 1-bits" variant that doesn't require us to retain the subtree size of + // the CV on top of the stack. The principle is the same: each CV that + // should remain in the stack is represented by a 1-bit in the total number + // of chunks (or bytes) so far. + fn merge_cv_stack(&mut self, total_len: u64) { + let post_merge_stack_len = total_len.count_ones() as usize; + while self.cv_stack.len() > post_merge_stack_len { + let right_child = self.cv_stack.pop().unwrap(); + let left_child = self.cv_stack.pop().unwrap(); + let parent_output = parent_node_output( + &left_child, + &right_child, + &self.key, + self.chunk_state.flags, + self.chunk_state.platform, + ); + self.cv_stack.push(parent_output.chaining_value()); + } + } + + // In reference_impl.rs, we merge the new CV with existing CVs from the + // stack before pushing it. We can do that because we know more input is + // coming, so we know none of the merges are root. + // + // This setting is different. We want to feed as much input as possible to + // compress_subtree_wide(), without setting aside anything for the + // chunk_state. If the user gives us 64 KiB, we want to parallelize over + // all 64 KiB at once as a single subtree, if at all possible. + // + // This leads to two problems: + // 1) This 64 KiB input might be the only call that ever gets made to + // update. In this case, the root node of the 64 KiB subtree would be + // the root node of the whole tree, and it would need to be ROOT + // finalized. We can't compress it until we know. + // 2) This 64 KiB input might complete a larger tree, whose root node is + // similarly going to be the the root of the whole tree. For example, + // maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't + // compress the node at the root of the 256 KiB subtree until we know + // how to finalize it. 
+ // + // The second problem is solved with "lazy merging". That is, when we're + // about to add a CV to the stack, we don't merge it with anything first, + // as the reference impl does. Instead we do merges using the *previous* CV + // that was added, which is sitting on top of the stack, and we put the new + // CV (unmerged) on top of the stack afterwards. This guarantees that we + // never merge the root node until finalize(). + // + // Solving the first problem requires an additional tool, + // compress_subtree_to_parent_node(). That function always returns the top + // *two* chaining values of the subtree it's compressing. We then do lazy + // merging with each of them separately, so that the second CV will always + // remain unmerged. (That also helps us support extendable output when + // we're hashing an input all-at-once.) + fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) { + self.merge_cv_stack(chunk_counter); + self.cv_stack.push(*new_cv); + } + + /// Add input bytes to the hash state. You can call this any number of + /// times. + /// + /// This method is always single-threaded. For multi-threading support, see + /// `update_with_join` below. + /// + /// Note that the degree of SIMD parallelism that `update` can use is + /// limited by the size of this input buffer. The 8 KiB buffer currently + /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but + /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to + /// leverage all currently supported SIMD instruction sets. + /// + /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html + pub fn update(&mut self, input: &[u8]) -> &mut Self { + self.update_with_join::<SerialJoin>(input) + } + + /// Add input bytes to the hash state, as with `update`, but potentially + /// using multi-threading. See the example below, and the + /// [`join`](join/index.html) module for a more detailed explanation. 
+ /// + /// To get any performance benefit from multi-threading, the input buffer + /// size needs to be very large. As a rule of thumb on x86_64, there is no + /// benefit to multi-threading inputs less than 128 KiB. Other platforms + /// have different thresholds, and in general you need to benchmark your + /// specific use case. Where possible, memory mapping an entire input file + /// is recommended, to take maximum advantage of multi-threading without + /// needing to tune a specific buffer size. Where memory mapping is not + /// possible, good multi-threading performance requires doing IO on a + /// background thread, to avoid sleeping all your worker threads while the + /// input buffer is (serially) refilled. This is quite complicated compared + /// to memory mapping. + /// + /// # Example + /// + /// ``` + /// // Hash a large input using multi-threading. Note that multi-threading + /// // comes with some overhead, and it can actually hurt performance for small + /// // inputs. The meaning of "small" varies, however, depending on the + /// // platform and the number of threads. (On x86_64, the cutoff tends to be + /// // around 128 KiB.) You should benchmark your own use case to see whether + /// // multi-threading helps. + /// # #[cfg(feature = "rayon")] + /// # { + /// # fn some_large_input() -> &'static [u8] { b"foo" } + /// let input: &[u8] = some_large_input(); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_with_join::<blake3::join::RayonJoin>(input); + /// let hash = hasher.finalize(); + /// # } + /// ``` + pub fn update_with_join<J: Join>(&mut self, mut input: &[u8]) -> &mut Self { + // If we have some partial chunk bytes in the internal chunk_state, we + // need to finish that chunk first. 
+ if self.chunk_state.len() > 0 { + let want = CHUNK_LEN - self.chunk_state.len(); + let take = cmp::min(want, input.len()); + self.chunk_state.update(&input[..take]); + input = &input[take..]; + if !input.is_empty() { + // We've filled the current chunk, and there's more input + // coming, so we know it's not the root and we can finalize it. + // Then we'll proceed to hashing whole chunks below. + debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN); + let chunk_cv = self.chunk_state.output().chaining_value(); + self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); + self.chunk_state = ChunkState::new( + &self.key, + self.chunk_state.chunk_counter + 1, + self.chunk_state.flags, + self.chunk_state.platform, + ); + } else { + return self; + } + } + + // Now the chunk_state is clear, and we have more input. If there's + // more than a single chunk (so, definitely not the root chunk), hash + // the largest whole subtree we can, with the full benefits of SIMD and + // multi-threading parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees + // along the right edge can be incomplete, and we don't know where + // the right edge is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until + // this point (if total is not 0). If the current incomplete subtree + // is only waiting for 1 more chunk, we can't hash a subtree of 4 + // chunks. We have to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or + // to evenly divide what we already have, this part runs in a loop. 
+ while input.len() > CHUNK_LEN { + debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); + debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); + let mut subtree_len = largest_power_of_two_leq(input.len()); + let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; + // Shrink the subtree_len until it evenly divides the count so far. + // We know that subtree_len itself is a power of 2, so we can use a + // bitmasking trick instead of an actual remainder operation. (Note + // that if the caller consistently passes power-of-2 inputs of the + // same size, as is hopefully typical, this loop condition will + // always fail, and subtree_len will always be the full length of + // the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. + // For example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, + // we'll still get the right answer in the end, and we might get to + // use 2-way SIMD parallelism. The problem with this optimization, + // is that it gets us stuck always hashing 2 chunks. The total + // number of chunks will remain odd, and we'll never graduate to + // higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while (subtree_len - 1) as u64 & count_so_far != 0 { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash + // that one chunk by itself. Otherwise, compress the subtree into a + // pair of CVs. 
+ let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; + if subtree_len <= CHUNK_LEN { + debug_assert_eq!(subtree_len, CHUNK_LEN); + self.push_cv( + &ChunkState::new( + &self.key, + self.chunk_state.chunk_counter, + self.chunk_state.flags, + self.chunk_state.platform, + ) + .update(&input[..subtree_len]) + .output() + .chaining_value(), + self.chunk_state.chunk_counter, + ); + } else { + // This is the high-performance happy path, though getting here + // depends on the caller giving us a long enough input. + let cv_pair = compress_subtree_to_parent_node::<J>( + &input[..subtree_len], + &self.key, + self.chunk_state.chunk_counter, + self.chunk_state.flags, + self.chunk_state.platform, + ); + let left_cv = array_ref!(cv_pair, 0, 32); + let right_cv = array_ref!(cv_pair, 32, 32); + // Push the two CVs we received into the CV stack in order. Because + // the stack merges lazily, this guarantees we aren't merging the + // root. + self.push_cv(left_cv, self.chunk_state.chunk_counter); + self.push_cv( + right_cv, + self.chunk_state.chunk_counter + (subtree_chunks / 2), + ); + } + self.chunk_state.chunk_counter += subtree_chunks; + input = &input[subtree_len..]; + } + + // What remains is 1 chunk or less. Add it to the chunk state. + debug_assert!(input.len() <= CHUNK_LEN); + if !input.is_empty() { + self.chunk_state.update(input); + // Having added some input to the chunk_state, we know what's in + // the CV stack won't become the root node, and we can do an extra + // merge. This simplifies finalize(). + self.merge_cv_stack(self.chunk_state.chunk_counter); + } + + self + } + + fn final_output(&self) -> Output { + // If the current chunk is the only chunk, that makes it the root node + // also. Convert it directly into an Output. Otherwise, we need to + // merge subtrees below. 
+ if self.cv_stack.is_empty() { + debug_assert_eq!(self.chunk_state.chunk_counter, 0); + return self.chunk_state.output(); + } + + // If there are any bytes in the ChunkState, finalize that chunk and + // merge its CV with everything in the CV stack. In that case, the work + // we did at the end of update() above guarantees that the stack + // doesn't contain any unmerged subtrees that need to be merged first. + // (This is important, because if there were two chunk hashes sitting + // on top of the stack, they would need to merge with each other, and + // merging a new chunk hash into them would be incorrect.) + // + // If there are no bytes in the ChunkState, we'll merge what's already + // in the stack. In this case it's fine if there are unmerged chunks on + // top, because we'll merge them with each other. Note that the case of + // the empty chunk is taken care of above. + let mut output: Output; + let mut num_cvs_remaining = self.cv_stack.len(); + if self.chunk_state.len() > 0 { + debug_assert_eq!( + self.cv_stack.len(), + self.chunk_state.chunk_counter.count_ones() as usize, + "cv stack does not need a merge" + ); + output = self.chunk_state.output(); + } else { + debug_assert!(self.cv_stack.len() >= 2); + output = parent_node_output( + &self.cv_stack[num_cvs_remaining - 2], + &self.cv_stack[num_cvs_remaining - 1], + &self.key, + self.chunk_state.flags, + self.chunk_state.platform, + ); + num_cvs_remaining -= 2; + } + while num_cvs_remaining > 0 { + output = parent_node_output( + &self.cv_stack[num_cvs_remaining - 1], + &output.chaining_value(), + &self.key, + self.chunk_state.flags, + self.chunk_state.platform, + ); + num_cvs_remaining -= 1; + } + output + } + + /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of + /// the input. + /// + /// This method is idempotent. Calling it twice will give the same result. + /// You can also add more input and finalize again. 
+ pub fn finalize(&self) -> Hash { + self.final_output().root_hash() + } + + /// Finalize the hash state and return an [`OutputReader`], which can + /// supply any number of output bytes. + /// + /// This method is idempotent. Calling it twice will give the same result. + /// You can also add more input and finalize again. + /// + /// [`OutputReader`]: struct.OutputReader.html + pub fn finalize_xof(&self) -> OutputReader { + OutputReader::new(self.final_output()) + } +} + +// Don't derive(Debug), because the state may be secret. +impl fmt::Debug for Hasher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Hasher") + .field("flags", &self.chunk_state.flags) + .field("platform", &self.chunk_state.platform) + .finish() + } +} + +impl Default for Hasher { + #[inline] + fn default() -> Self { + Self::new() + } +} + +#[cfg(feature = "std")] +impl std::io::Write for Hasher { + /// This is equivalent to [`update`](#method.update). + #[inline] + fn write(&mut self, input: &[u8]) -> std::io::Result<usize> { + self.update(input); + Ok(input.len()) + } + + #[inline] + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +/// An incremental reader for extended output, returned by +/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). +#[derive(Clone)] +pub struct OutputReader { + inner: Output, + position_within_block: u8, +} + +impl OutputReader { + fn new(inner: Output) -> Self { + Self { + inner, + position_within_block: 0, + } + } + + /// Fill a buffer with output bytes and advance the position of the + /// `OutputReader`. This is equivalent to [`Read::read`], except that it + /// doesn't return a `Result`. Both methods always fill the entire buffer. + /// + /// Note that `OutputReader` doesn't buffer output bytes internally, so + /// calling `fill` repeatedly with a short-length or odd-length slice will + /// end up performing the same compression multiple times. 
If you're + /// reading output in a loop, prefer a slice length that's a multiple of + /// 64. + /// + /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try + /// to extract more than that, for example by seeking near the end and + /// reading further, the behavior is unspecified. + /// + /// [`Read::read`]: #method.read + pub fn fill(&mut self, mut buf: &mut [u8]) { + while !buf.is_empty() { + let block: [u8; BLOCK_LEN] = self.inner.root_output_block(); + let output_bytes = &block[self.position_within_block as usize..]; + let take = cmp::min(buf.len(), output_bytes.len()); + buf[..take].copy_from_slice(&output_bytes[..take]); + buf = &mut buf[take..]; + self.position_within_block += take as u8; + if self.position_within_block == BLOCK_LEN as u8 { + self.inner.counter += 1; + self.position_within_block = 0; + } + } + } + + /// Return the current read position in the output stream. The position of + /// a new `OutputReader` starts at 0, and each call to [`fill`] or + /// [`Read::read`] moves the position forward by the number of bytes read. + /// + /// [`fill`]: #method.fill + /// [`Read::read`]: #method.read + pub fn position(&self) -> u64 { + self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64 + } + + /// Seek to a new read position in the output stream. This is equivalent to + /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't + /// return a `Result`. + /// + /// [`Seek::seek`]: #method.seek + /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html + pub fn set_position(&mut self, position: u64) { + self.position_within_block = (position % BLOCK_LEN as u64) as u8; + self.inner.counter = position / BLOCK_LEN as u64; + } +} + +// Don't derive(Debug), because the state may be secret. 
+impl fmt::Debug for OutputReader { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("OutputReader") + .field("position", &self.position()) + .finish() + } +} + +#[cfg(feature = "std")] +impl std::io::Read for OutputReader { + #[inline] + fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { + self.fill(buf); + Ok(buf.len()) + } +} + +#[cfg(feature = "std")] +impl std::io::Seek for OutputReader { + fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> { + let max_position = u64::max_value() as i128; + let target_position: i128 = match pos { + std::io::SeekFrom::Start(x) => x as i128, + std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128, + std::io::SeekFrom::End(_) => { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "seek from end not supported", + )); + } + }; + if target_position < 0 { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "seek before start", + )); + } + self.set_position(cmp::min(target_position, max_position) as u64); + Ok(self.position()) + } +} diff --git a/3rdparty/BLAKE3/src/platform.rs b/3rdparty/BLAKE3/src/platform.rs new file mode 100644 index 000000000..4bd67de7a --- /dev/null +++ b/3rdparty/BLAKE3/src/platform.rs @@ -0,0 +1,487 @@ +use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; +use arrayref::{array_mut_ref, array_ref}; + +cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + cfg_if::cfg_if! { + if #[cfg(blake3_avx512_ffi)] { + pub const MAX_SIMD_DEGREE: usize = 16; + } else { + pub const MAX_SIMD_DEGREE: usize = 8; + } + } + } else if #[cfg(feature = "neon")] { + pub const MAX_SIMD_DEGREE: usize = 4; + } else { + pub const MAX_SIMD_DEGREE: usize = 1; + } +} + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. 
Constant contexts aren't currently +// allowed to use cmp::max, so we have to hardcode this additional constant +// value. Get rid of this once cmp::max is a const fn. +cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + cfg_if::cfg_if! { + if #[cfg(blake3_avx512_ffi)] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 16; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 8; + } + } + } else if #[cfg(feature = "neon")] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 4; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 2; + } +} + +#[derive(Clone, Copy, Debug)] +pub enum Platform { + Portable, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE2, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE41, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX2, + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX512, + #[cfg(feature = "neon")] + NEON, +} + +impl Platform { + #[allow(unreachable_code)] + pub fn detect() -> Self { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + #[cfg(blake3_avx512_ffi)] + { + if avx512_detected() { + return Platform::AVX512; + } + } + if avx2_detected() { + return Platform::AVX2; + } + if sse41_detected() { + return Platform::SSE41; + } + if sse2_detected() { + return Platform::SSE2; + } + } + // We don't use dynamic feature detection for NEON. If the "neon" + // feature is on, NEON is assumed to be supported. 
+ #[cfg(feature = "neon")] + { + return Platform::NEON; + } + Platform::Portable + } + + pub fn simd_degree(&self) -> usize { + let degree = match self { + Platform::Portable => 1, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => 8, + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => 16, + #[cfg(feature = "neon")] + Platform::NEON => 4, + }; + debug_assert!(degree <= MAX_SIMD_DEGREE); + degree + } + + pub fn compress_in_place( + &self, + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) { + match self { + Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + crate::sse41::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::compress_in_place(cv, block, block_len, counter, flags) + }, + // No NEON compress_in_place() implementation yet. 
+ #[cfg(feature = "neon")] + Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), + } + } + + pub fn compress_xof( + &self, + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) -> [u8; 64] { + match self { + Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + crate::sse41::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::compress_xof(cv, block, block_len, counter, flags) + }, + // No NEON compress_xof() implementation yet. + #[cfg(feature = "neon")] + Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), + } + } + + // IMPLEMENTATION NOTE + // =================== + // hash_many() applies two optimizations. The critically important + // optimization is the high-performance parallel SIMD hashing mode, + // described in detail in the spec. This more than doubles throughput per + // thread. Another optimization is keeping the state vectors transposed + // from block to block within a chunk. When state vectors are transposed + // after every block, there's a small but measurable performance loss. + // Compressing chunks with a dedicated loop avoids this. 
+ + pub fn hash_many<A: arrayvec::Array<Item = u8>>( + &self, + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], + ) { + match self { + Platform::Portable => portable::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => unsafe { + crate::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => unsafe { + crate::avx2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Assumed to be safe if the "neon" feature is on. + #[cfg(feature = "neon")] + Platform::NEON => unsafe { + crate::neon::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + } + } + + // Explicit platform constructors, for benchmarks. 
+ + pub fn portable() -> Self { + Self::Portable + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse2() -> Option<Self> { + if sse2_detected() { + Some(Self::SSE2) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse41() -> Option<Self> { + if sse41_detected() { + Some(Self::SSE41) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx2() -> Option<Self> { + if avx2_detected() { + Some(Self::AVX2) + } else { + None + } + } + + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx512() -> Option<Self> { + if avx512_detected() { + Some(Self::AVX512) + } else { + None + } + } + + #[cfg(feature = "neon")] + pub fn neon() -> Option<Self> { + // Assumed to be safe if the "neon" feature is on. + Some(Self::NEON) + } +} + +// Note that AVX-512 is divided into multiple featuresets, and we use two of +// them, F and VL. +#[cfg(blake3_avx512_ffi)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn avx512_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_avx512") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn avx2_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_avx2") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "avx2")] + { + return true; + } + // Dynamic check, if std is enabled. 
+ #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx2") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn sse41_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_sse41") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "sse4.1")] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("sse4.1") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +#[allow(unreachable_code)] +pub fn sse2_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_sse2") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "sse2")] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("sse2") { + return true; + } + } + false +} + +#[inline(always)] +pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { + let mut out = [0; 8]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out +} + +#[inline(always)] +pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { + let mut out = [0; 16]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = 
u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); + out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); + out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); + out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); + out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); + out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); + out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); + out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { + let mut out = [0; 64]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + *array_mut_ref!(out, 8 
* 4, 4) = words[8].to_le_bytes(); + *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); + *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); + *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); + *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); + *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); + *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); + *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); + out +} diff --git a/3rdparty/BLAKE3/src/portable.rs b/3rdparty/BLAKE3/src/portable.rs new file mode 100644 index 000000000..0a569cec7 --- /dev/null +++ b/3rdparty/BLAKE3/src/portable.rs @@ -0,0 +1,198 @@ +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref}; + +#[inline(always)] +fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +#[inline(always)] +fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { + // Select the message schedule based on the round. + let schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the diagonals. 
+ g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +#[inline(always)] +fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u32; 16] { + let block_words = crate::platform::words_from_le_bytes_64(block); + + let mut state = [ + cv[0], + cv[1], + cv[2], + cv[3], + cv[4], + cv[5], + cv[6], + cv[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ]; + + round(&mut state, &block_words, 0); + round(&mut state, &block_words, 1); + round(&mut state, &block_words, 2); + round(&mut state, &block_words, 3); + round(&mut state, &block_words, 4); + round(&mut state, &block_words, 5); + round(&mut state, &block_words, 6); + + state +} + +pub fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let state = compress_pre(cv, block, block_len, counter, flags); + + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +pub fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut state = compress_pre(cv, block, block_len, counter, flags); + state[0] ^= state[8]; + state[1] ^= state[9]; + state[2] ^= state[10]; + state[3] ^= state[11]; + state[4] ^= state[12]; + state[5] ^= state[13]; + state[6] ^= state[14]; + state[7] ^= state[15]; + state[8] ^= cv[0]; + state[9] ^= cv[1]; + state[10] ^= cv[2]; + state[11] ^= cv[3]; + state[12] ^= cv[4]; + state[13] ^= cv[5]; + state[14] ^= cv[6]; + state[15] ^= cv[7]; + 
crate::platform::le_bytes_from_words_64(&state) +} + +pub fn hash1<A: arrayvec::Array<Item = u8>>( + input: &A, + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = input.as_slice(); + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = crate::platform::le_bytes_from_words_32(&cv); +} + +pub fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +pub mod test { + use super::*; + + // This is basically testing the portable implementation against itself, + // but it also checks that compress_in_place and compress_xof are + // consistent. And there are tests against the reference implementation and + // against hardcoded test vectors elsewhere. + #[test] + fn test_compress() { + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + // Ditto. 
+ #[test] + fn test_hash_many() { + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/3rdparty/BLAKE3/src/rust_avx2.rs b/3rdparty/BLAKE3/src/rust_avx2.rs new file mode 100644 index 000000000..6ab773ad4 --- /dev/null +++ b/3rdparty/BLAKE3/src/rust_avx2.rs @@ -0,0 +1,474 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, +}; +use arrayref::{array_mut_ref, mut_array_refs}; + +pub const DEGREE: usize = 8; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m256i { + // This is an unaligned load, so the pointer cast is allowed. + _mm256_loadu_si256(src as *const __m256i) +} + +#[inline(always)] +unsafe fn storeu(src: __m256i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm256_storeu_si256(dest as *mut __m256i, src) +} + +#[inline(always)] +unsafe fn add(a: __m256i, b: __m256i) -> __m256i { + _mm256_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { + _mm256_xor_si256(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m256i { + _mm256_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { + _mm256_setr_epi32( + a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, + ) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. 
+ +#[inline(always)] +unsafe fn rot16(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] 
= add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn interleave128(a: 
__m256i, b: __m256i) -> (__m256i, __m256i) { + ( + _mm256_permute2x128_si256(a, b, 0x20), + _mm256_permute2x128_si256(a, b, 0x31), + ) +} + +// There are several ways to do a transposition. We could do it naively, with 8 separate +// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy +// the vecs into contiguous storage and then use gather instructions. This third approach is to use +// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the +// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the +// https://github.com/oconnor663/bao_experiments repo. +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. + let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. + let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
+ let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); + let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); + let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); + let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); + + vecs[0] = abcdefgh_0; + vecs[1] = abcdefgh_1; + vecs[2] = abcdefgh_2; + vecs[3] = abcdefgh_3; + vecs[4] = abcdefgh_4; + vecs[5] = abcdefgh_5; + vecs[6] = abcdefgh_6; + vecs[7] = abcdefgh_7; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set8( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + 
counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + counter_low(counter + (mask & 4)), + counter_low(counter + (mask & 5)), + counter_low(counter + (mask & 6)), + counter_low(counter + (mask & 7)), + ), + set8( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + counter_high(counter + (mask & 4)), + counter_high(counter + (mask & 5)), + counter_high(counter + (mask & 6)), + counter_high(counter + (mask & 7)), + ), + ) +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash8( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&mut h_vecs); + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + mut inputs: &[&A], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = A::CAPACITY / BLOCK_LEN; + hash8( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + crate::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ); +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::avx2_detected() { + return; + } + + #[target_feature(enable = "avx2")] + unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/3rdparty/BLAKE3/src/rust_sse2.rs b/3rdparty/BLAKE3/src/rust_sse2.rs new file mode 100644 index 000000000..15b52ee5d --- /dev/null +++ b/3rdparty/BLAKE3/src/rust_sse2.rs @@ -0,0 +1,775 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref, mut_array_refs}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. 
// Rotate every 32-bit lane of `a` right by 16 bits: the bits shifted out on
// the right are re-inserted on the left by the complementary left shift.
#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
    let shifted_down = _mm_srli_epi32(a, 16);
    let wrapped_around = _mm_slli_epi32(a, 32 - 16);
    _mm_or_si128(wrapped_around, shifted_down)
}

// Rotate every 32-bit lane of `a` right by 12 bits.
#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
    let shifted_down = _mm_srli_epi32(a, 12);
    let wrapped_around = _mm_slli_epi32(a, 32 - 12);
    _mm_or_si128(wrapped_around, shifted_down)
}

// Rotate every 32-bit lane of `a` right by 8 bits.
#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
    let shifted_down = _mm_srli_epi32(a, 8);
    let wrapped_around = _mm_slli_epi32(a, 32 - 8);
    _mm_or_si128(wrapped_around, shifted_down)
}

// Rotate every 32-bit lane of `a` right by 7 bits.
#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
    let shifted_down = _mm_srli_epi32(a, 7);
    let wrapped_around = _mm_slli_epi32(a, 32 - 7);
    _mm_or_si128(wrapped_around, shifted_down)
}

// Mixing step that uses the 16-bit and 12-bit rotations. Per 32-bit lane,
// with wrapping addition:
//   row0 += m + row1
//   row3 = (row3 ^ row0) >>> 16
//   row2 += row3
//   row1 = (row1 ^ row2) >>> 12
// The lane-wise add/xor intrinsics are called directly here.
#[inline(always)]
unsafe fn g1(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    let new_row0 = _mm_add_epi32(_mm_add_epi32(*row0, m), *row1);
    let new_row3 = rot16(_mm_xor_si128(*row3, new_row0));
    let new_row2 = _mm_add_epi32(*row2, new_row3);
    let new_row1 = rot12(_mm_xor_si128(*row1, new_row2));

    *row0 = new_row0;
    *row1 = new_row1;
    *row2 = new_row2;
    *row3 = new_row3;
}

// Mixing step that uses the 8-bit and 7-bit rotations. Same data flow as
// `g1`, only the rotation distances differ:
//   row0 += m + row1
//   row3 = (row3 ^ row0) >>> 8
//   row2 += row3
//   row1 = (row1 ^ row2) >>> 7
#[inline(always)]
unsafe fn g2(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    let new_row0 = _mm_add_epi32(_mm_add_epi32(*row0, m), *row1);
    let new_row3 = rot8(_mm_xor_si128(*row3, new_row0));
    let new_row2 = _mm_add_epi32(*row2, new_row3);
    let new_row1 = rot7(_mm_xor_si128(*row1, new_row2));

    *row0 = new_row0;
    *row1 = new_row1;
    *row2 = new_row2;
    *row3 = new_row3;
}

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
// Packs four 2-bit selectors into one shuffle immediate; the first argument
// lands in bits 7:6 and the last argument in bits 1:0.
macro_rules! _MM_SHUFFLE {
    ($sel3:expr, $sel2:expr, $sel1:expr, $sel0:expr) => {
        ($sel3 << 6) | ($sel2 << 4) | ($sel1 << 2) | $sel0
    };
}

// Two-source 32-bit lane shuffle on integer vectors, implemented by casting
// both inputs to float vectors, running `_mm_shuffle_ps` with the immediate
// `$imm`, and casting the result back.
macro_rules! shuffle2 {
    ($first:expr, $second:expr, $imm:expr) => {{
        let picked = _mm_shuffle_ps(_mm_castsi128_ps($first), _mm_castsi128_ps($second), $imm);
        _mm_castps_si128(picked)
    }};
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this.
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + let mut mask = _mm_set1_epi16(imm8 as i16); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = 
shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse2")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse2")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, mut row3] = + compress_pre(cv, block, block_len, 
counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], 
m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do.
+ let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + transpose_vecs(squares.2); + transpose_vecs(squares.3); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if 
increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse2")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. 
+ storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse2")] +unsafe fn hash1<A: arrayvec::Array<Item = u8>>( + input: &A, + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = input.as_slice(); + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse2")] +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + mut inputs: &[&A], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = A::CAPACITY / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::sse2_detected() { + return; + } + + #[target_feature(enable = "sse2")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/3rdparty/BLAKE3/src/rust_sse41.rs b/3rdparty/BLAKE3/src/rust_sse41.rs new file mode 100644 index 000000000..d5cf0f4a9 --- /dev/null +++ b/3rdparty/BLAKE3/src/rust_sse41.rs @@ -0,0 +1,766 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref, mut_array_refs}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. 
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.

// Rotate every 32-bit lane of `a` right by 16 bits: the bits shifted out on
// the right are re-inserted on the left by the complementary left shift.
#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
    let shifted_down = _mm_srli_epi32(a, 16);
    let wrapped_around = _mm_slli_epi32(a, 32 - 16);
    _mm_or_si128(wrapped_around, shifted_down)
}

// Rotate every 32-bit lane of `a` right by 12 bits.
#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
    let shifted_down = _mm_srli_epi32(a, 12);
    let wrapped_around = _mm_slli_epi32(a, 32 - 12);
    _mm_or_si128(wrapped_around, shifted_down)
}

// Rotate every 32-bit lane of `a` right by 8 bits.
#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
    let shifted_down = _mm_srli_epi32(a, 8);
    let wrapped_around = _mm_slli_epi32(a, 32 - 8);
    _mm_or_si128(wrapped_around, shifted_down)
}

// Rotate every 32-bit lane of `a` right by 7 bits.
#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
    let shifted_down = _mm_srli_epi32(a, 7);
    let wrapped_around = _mm_slli_epi32(a, 32 - 7);
    _mm_or_si128(wrapped_around, shifted_down)
}

// Mixing step that uses the 16-bit and 12-bit rotations. Per 32-bit lane,
// with wrapping addition:
//   row0 += m + row1
//   row3 = (row3 ^ row0) >>> 16
//   row2 += row3
//   row1 = (row1 ^ row2) >>> 12
// The lane-wise add/xor intrinsics are called directly here.
#[inline(always)]
unsafe fn g1(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    let new_row0 = _mm_add_epi32(_mm_add_epi32(*row0, m), *row1);
    let new_row3 = rot16(_mm_xor_si128(*row3, new_row0));
    let new_row2 = _mm_add_epi32(*row2, new_row3);
    let new_row1 = rot12(_mm_xor_si128(*row1, new_row2));

    *row0 = new_row0;
    *row1 = new_row1;
    *row2 = new_row2;
    *row3 = new_row3;
}

// Mixing step that uses the 8-bit and 7-bit rotations. Same data flow as
// `g1`, only the rotation distances differ:
//   row0 += m + row1
//   row3 = (row3 ^ row0) >>> 8
//   row2 += row3
//   row1 = (row1 ^ row2) >>> 7
#[inline(always)]
unsafe fn g2(
    row0: &mut __m128i,
    row1: &mut __m128i,
    row2: &mut __m128i,
    row3: &mut __m128i,
    m: __m128i,
) {
    let new_row0 = _mm_add_epi32(_mm_add_epi32(*row0, m), *row1);
    let new_row3 = rot8(_mm_xor_si128(*row3, new_row0));
    let new_row2 = _mm_add_epi32(*row2, new_row3);
    let new_row1 = rot7(_mm_xor_si128(*row1, new_row2));

    *row0 = new_row0;
    *row1 = new_row1;
    *row2 = new_row2;
    *row3 = new_row3;
}

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
// Packs four 2-bit selectors into one shuffle immediate; the first argument
// lands in bits 7:6 and the last argument in bits 1:0.
macro_rules! _MM_SHUFFLE {
    ($sel3:expr, $sel2:expr, $sel1:expr, $sel0:expr) => {
        ($sel3 << 6) | ($sel2 << 4) | ($sel1 << 2) | $sel0
    };
}

// Two-source 32-bit lane shuffle on integer vectors, implemented by casting
// both inputs to float vectors, running `_mm_shuffle_ps` with the immediate
// `$imm`, and casting the result back.
macro_rules! shuffle2 {
    ($first:expr, $second:expr, $imm:expr) => {{
        let picked = _mm_shuffle_ps(_mm_castsi128_ps($first), _mm_castsi128_ps($second), $imm);
        _mm_castps_si128(picked)
    }};
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this.
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = 
_mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, 
row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, 
mut row3] = + compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = 
rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. 
+ let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + transpose_vecs(squares.2); + transpose_vecs(squares.3); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if 
increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. 
+ storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn hash1<A: arrayvec::Array<Item = u8>>( + input: &A, + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = input.as_slice(); + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + mut inputs: &[&A], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = A::CAPACITY / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::sse41_detected() { + return; + } + + #[target_feature(enable = "sse4.1")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/3rdparty/BLAKE3/src/test.rs b/3rdparty/BLAKE3/src/test.rs new file mode 100644 index 000000000..eefb1a354 --- /dev/null +++ b/3rdparty/BLAKE3/src/test.rs @@ -0,0 +1,569 @@ +use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use arrayref::array_ref; +use arrayvec::ArrayVec; +use core::sync::atomic::{AtomicUsize, Ordering}; +use core::usize; +use rand::prelude::*; + +// Interesting input lengths to run tests on. +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; + +// There's a test to make sure these two are equal below. +pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; +pub const TEST_KEY_WORDS: CVWords = [ + 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, +]; + +// Paint the input with a repeating byte pattern. We use a cycle length of 251, +// because that's the largets prime number less than 256. 
This makes it +// unlikely to swapping any two adjacent input blocks or chunks will give the +// same answer. +pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +type CompressInPlaceFn = + unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); + +type CompressXofFn = unsafe fn( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64]; + +// A shared helper function for platform-specific tests. +pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { + let initial_state = TEST_KEY_WORDS; + let block_len: u8 = 61; + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len as usize]); + // Use a counter with set bits in both 32-bit words. + let counter = (5u64 << 32) + 6; + let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; + + let portable_out = + crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); + + let mut test_state = initial_state; + unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; + let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); + let test_xof = + unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; + + assert_eq!(&portable_out[..32], &test_state_bytes[..]); + assert_eq!(&portable_out[..], &test_xof[..]); +} + +type HashManyFn<A> = unsafe fn( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +); + +// A shared helper function for platform-specific tests. 
+pub fn test_hash_many_fn( + hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, + hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, +) { + // 31 (16 + 8 + 4 + 2 + 1) inputs + const NUM_INPUTS: usize = 31; + let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; + crate::test::paint_test_input(&mut input_buf); + // A counter just prior to u32::MAX. + let counter = (1u64 << 32) - 1; + + // First hash chunks. + let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new(); + for i in 0..NUM_INPUTS { + chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); + } + let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + crate::portable::hash_many( + &chunks, + &TEST_KEY_WORDS, + counter, + IncrementCounter::Yes, + crate::KEYED_HASH, + crate::CHUNK_START, + crate::CHUNK_END, + &mut portable_chunks_out, + ); + + let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_chunks_fn( + &chunks[..], + &TEST_KEY_WORDS, + counter, + IncrementCounter::Yes, + crate::KEYED_HASH, + crate::CHUNK_START, + crate::CHUNK_END, + &mut test_chunks_out, + ); + } + for n in 0..NUM_INPUTS { + #[cfg(feature = "std")] + dbg!(n); + assert_eq!( + &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], + &test_chunks_out[n * OUT_LEN..][..OUT_LEN] + ); + } + + // Then hash parents. 
+ let mut parents = ArrayVec::<[&[u8; 2 * OUT_LEN]; NUM_INPUTS]>::new(); + for i in 0..NUM_INPUTS { + parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); + } + let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; + crate::portable::hash_many( + &parents, + &TEST_KEY_WORDS, + counter, + IncrementCounter::No, + crate::KEYED_HASH | crate::PARENT, + 0, + 0, + &mut portable_parents_out, + ); + + let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_parents_fn( + &parents[..], + &TEST_KEY_WORDS, + counter, + IncrementCounter::No, + crate::KEYED_HASH | crate::PARENT, + 0, + 0, + &mut test_parents_out, + ); + } + for n in 0..NUM_INPUTS { + #[cfg(feature = "std")] + dbg!(n); + assert_eq!( + &portable_parents_out[n * OUT_LEN..][..OUT_LEN], + &test_parents_out[n * OUT_LEN..][..OUT_LEN] + ); + } +} + +#[test] +fn test_key_bytes_equal_key_words() { + assert_eq!( + TEST_KEY_WORDS, + crate::platform::words_from_le_bytes_32(&TEST_KEY), + ); +} + +#[test] +fn test_reference_impl_size() { + // Because the Rust compiler optimizes struct layout, it's possible that + // some future version of the compiler will produce a different size. If + // that happens, we can either disable this test, or test for multiple + // expected values. For now, the purpose of this test is to make sure we + // notice if that happens. + assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>()); +} + +#[test] +fn test_counter_words() { + let counter: u64 = (1 << 32) + 2; + assert_eq!(crate::counter_low(counter), 2); + assert_eq!(crate::counter_high(counter), 1); +} + +#[test] +fn test_largest_power_of_two_leq() { + let input_output = &[ + // The zero case is nonsensical, but it does work. 
+ (0, 1), + (1, 1), + (2, 2), + (3, 2), + (4, 4), + (5, 4), + (6, 4), + (7, 4), + (8, 8), + // the largest possible usize + (usize::MAX, (usize::MAX >> 1) + 1), + ]; + for &(input, output) in input_output { + assert_eq!( + output, + crate::largest_power_of_two_leq(input), + "wrong output for n={}", + input + ); + } +} + +#[test] +fn test_left_len() { + let input_output = &[ + (CHUNK_LEN + 1, CHUNK_LEN), + (2 * CHUNK_LEN - 1, CHUNK_LEN), + (2 * CHUNK_LEN, CHUNK_LEN), + (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN, 2 * CHUNK_LEN), + (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), + ]; + for &(input, output) in input_output { + assert_eq!(crate::left_len(input), output); + } +} + +#[test] +fn test_compare_reference_impl() { + const OUT: usize = 303; // more than 64, not a multiple of 4 + let mut input_buf = [0; TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + for &case in TEST_CASES { + let input = &input_buf[..case]; + #[cfg(feature = "std")] + dbg!(case); + + // regular + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = crate::hash(input); + assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); + // incremental + let mut hasher = crate::Hasher::new(); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended[..], expected_out[..]); + } + + // keyed + { + let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = crate::keyed_hash(&TEST_KEY, input); + assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); + // 
incremental + let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended[..], expected_out[..]); + } + + // derive_key + { + let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let mut test_out = [0; OUT]; + crate::derive_key(context, input, &mut test_out); + assert_eq!(test_out[..], expected_out[..]); + // incremental + let mut hasher = crate::Hasher::new_derive_key(context); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended[..], expected_out[..]); + } + } +} + +fn reference_hash(input: &[u8]) -> crate::Hash { + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input); + let mut bytes = [0; 32]; + hasher.finalize(&mut bytes); + bytes.into() +} + +#[test] +fn test_compare_update_multiple() { + // Don't use all the long test cases here, since that's unnecessarily slow + // in debug mode. 
+ let mut short_test_cases = TEST_CASES; + while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { + short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; + } + assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); + + let mut input_buf = [0; 2 * TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + + for &first_update in short_test_cases { + #[cfg(feature = "std")] + dbg!(first_update); + let first_input = &input_buf[..first_update]; + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(first_input); + + for &second_update in short_test_cases { + #[cfg(feature = "std")] + dbg!(second_update); + let second_input = &input_buf[first_update..][..second_update]; + let total_input = &input_buf[..first_update + second_update]; + + // Clone the hasher with first_update bytes already written, so + // that the next iteration can reuse it. + let mut test_hasher = test_hasher.clone(); + test_hasher.update(second_input); + let expected = reference_hash(total_input); + assert_eq!(expected, test_hasher.finalize()); + } + } +} + +#[test] +fn test_fuzz_hasher() { + const INPUT_MAX: usize = 4 * CHUNK_LEN; + let mut input_buf = [0; 3 * INPUT_MAX]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; + + // Use a fixed RNG seed for reproducibility. + let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); + for _num_test in 0..num_tests { + #[cfg(feature = "std")] + dbg!(_num_test); + let mut hasher = crate::Hasher::new(); + let mut total_input = 0; + // For each test, write 3 inputs of random length. 
+ for _ in 0..3 { + let input_len = rng.gen_range(0, INPUT_MAX + 1); + #[cfg(feature = "std")] + dbg!(input_len); + let input = &input_buf[total_input..][..input_len]; + hasher.update(input); + total_input += input_len; + } + let expected = reference_hash(&input_buf[..total_input]); + assert_eq!(expected, hasher.finalize()); + } +} + +#[test] +fn test_xof_seek() { + let mut out = [0; 533]; + let mut hasher = crate::Hasher::new(); + hasher.update(b"foo"); + hasher.finalize_xof().fill(&mut out); + assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); + + let mut reader = hasher.finalize_xof(); + reader.set_position(303); + let mut out2 = [0; 102]; + reader.fill(&mut out2); + assert_eq!(&out[303..][..102], &out2[..]); + + #[cfg(feature = "std")] + { + use std::io::prelude::*; + let mut reader = hasher.finalize_xof(); + reader.seek(std::io::SeekFrom::Start(303)).unwrap(); + let mut out3 = Vec::new(); + reader.by_ref().take(102).read_to_end(&mut out3).unwrap(); + assert_eq!(&out[303..][..102], &out3[..]); + + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 + ); + reader.seek(std::io::SeekFrom::Current(-5)).unwrap(); + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 - 5 + ); + let mut out4 = [0; 17]; + assert_eq!(reader.read(&mut out4).unwrap(), 17); + assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]); + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 - 5 + 17 + ); + assert!(reader.seek(std::io::SeekFrom::End(0)).is_err()); + assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err()); + } +} + +#[test] +fn test_msg_schdule_permutation() { + let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + + let mut generated = [[0; 16]; 7]; + generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + + for round in 1..7 { + for i in 0..16 { + generated[round][i] = generated[round - 1][permutation[i]]; + } + } + + assert_eq!(generated, 
crate::MSG_SCHEDULE); +} + +#[test] +fn test_reset() { + let mut hasher = crate::Hasher::new(); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3])); + + let key = &[99; crate::KEY_LEN]; + let mut keyed_hasher = crate::Hasher::new_keyed(key); + keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); + keyed_hasher.reset(); + keyed_hasher.update(&[42; CHUNK_LEN + 3]); + assert_eq!( + keyed_hasher.finalize(), + crate::keyed_hash(key, &[42; CHUNK_LEN + 3]), + ); + + let context = "BLAKE3 2020-02-12 10:20:58 reset test"; + let mut kdf = crate::Hasher::new_derive_key(context); + kdf.update(&[42; 3 * CHUNK_LEN + 7]); + kdf.reset(); + kdf.update(&[42; CHUNK_LEN + 3]); + let mut expected = [0; crate::OUT_LEN]; + crate::derive_key(context, &[42; CHUNK_LEN + 3], &mut expected); + assert_eq!(kdf.finalize(), expected); +} + +#[test] +#[cfg(feature = "rayon")] +fn test_update_with_rayon_join() { + let mut input = [0; TEST_CASES_MAX]; + paint_test_input(&mut input); + let rayon_hash = crate::Hasher::new() + .update_with_join::<crate::join::RayonJoin>(&input) + .finalize(); + assert_eq!(crate::hash(&input), rayon_hash); +} + +// Test that the length values given to Join::join are what they're supposed to +// be. +#[test] +fn test_join_lengths() { + // Use static atomics to let us safely get a couple of values in and out of + // CustomJoin. This avoids depending on std, though it assumes that this + // thread will only run once in the lifetime of the runner process. + static SINGLE_THREAD_LEN: AtomicUsize = AtomicUsize::new(0); + static CUSTOM_JOIN_CALLS: AtomicUsize = AtomicUsize::new(0); + + // Use an input that's exactly (simd_degree * CHUNK_LEN) + 1. That should + // guarantee that compress_subtree_wide does exactly one split, with the + // last byte on the right side. 
Note that it we used + // Hasher::update_with_join, we would end up buffering that last byte, + // rather than splitting and joining it. + let single_thread_len = crate::platform::Platform::detect().simd_degree() * CHUNK_LEN; + SINGLE_THREAD_LEN.store(single_thread_len, Ordering::SeqCst); + let mut input_buf = [0; 2 * crate::platform::MAX_SIMD_DEGREE * CHUNK_LEN]; + paint_test_input(&mut input_buf); + let input = &input_buf[..single_thread_len + 1]; + + enum CustomJoin {} + + impl crate::join::Join for CustomJoin { + fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send, + { + let prev_calls = CUSTOM_JOIN_CALLS.fetch_add(1, Ordering::SeqCst); + assert_eq!(prev_calls, 0); + assert_eq!(len_a, SINGLE_THREAD_LEN.load(Ordering::SeqCst)); + assert_eq!(len_b, 1); + (oper_a(), oper_b()) + } + } + + let mut out_buf = [0; crate::platform::MAX_SIMD_DEGREE_OR_2 * CHUNK_LEN]; + crate::compress_subtree_wide::<CustomJoin>( + input, + crate::IV, + 0, + 0, + crate::platform::Platform::detect(), + &mut out_buf, + ); + assert_eq!(CUSTOM_JOIN_CALLS.load(Ordering::SeqCst), 1); +} diff --git a/3rdparty/BLAKE3/src/traits.rs b/3rdparty/BLAKE3/src/traits.rs new file mode 100644 index 000000000..9704e0106 --- /dev/null +++ b/3rdparty/BLAKE3/src/traits.rs @@ -0,0 +1,184 @@ +//! Implementations of commonly used traits like +//! [`digest::Digest`](https://crates.io/crates/digest) and +//! [`crypto_mac::Mac`](https://crates.io/crates/crypto-mac). 
+ +pub use crypto_mac; +pub use digest; + +use crate::{Hasher, OutputReader}; +use digest::generic_array::{ + typenum::{U32, U64}, + GenericArray, +}; + +impl digest::BlockInput for Hasher { + type BlockSize = U64; +} + +impl digest::Update for Hasher { + #[inline] + fn update(&mut self, data: impl AsRef<[u8]>) { + self.update(data.as_ref()); + } +} + +impl digest::Reset for Hasher { + #[inline] + fn reset(&mut self) { + self.reset(); // the inherent method + } +} + +impl digest::FixedOutput for Hasher { + type OutputSize = U32; + + #[inline] + fn finalize_into(self, out: &mut GenericArray<u8, Self::OutputSize>) { + out.copy_from_slice(self.finalize().as_bytes()); + } + + #[inline] + fn finalize_into_reset(&mut self, out: &mut GenericArray<u8, Self::OutputSize>) { + out.copy_from_slice(self.finalize().as_bytes()); + self.reset(); + } +} + +impl digest::ExtendableOutput for Hasher { + type Reader = OutputReader; + + #[inline] + fn finalize_xof(self) -> Self::Reader { + Hasher::finalize_xof(&self) + } + + #[inline] + fn finalize_xof_reset(&mut self) -> Self::Reader { + let reader = Hasher::finalize_xof(self); + self.reset(); + reader + } +} + +impl digest::XofReader for OutputReader { + #[inline] + fn read(&mut self, buffer: &mut [u8]) { + self.fill(buffer); + } +} + +impl crypto_mac::NewMac for Hasher { + type KeySize = U32; + + #[inline] + fn new(key: &crypto_mac::Key<Self>) -> Self { + let key_bytes: [u8; 32] = (*key).into(); + Hasher::new_keyed(&key_bytes) + } +} + +impl crypto_mac::Mac for Hasher { + type OutputSize = U32; + + #[inline] + fn update(&mut self, data: &[u8]) { + self.update(data); + } + + #[inline] + fn reset(&mut self) { + self.reset(); + } + + #[inline] + fn finalize(self) -> crypto_mac::Output<Self> { + crypto_mac::Output::new(digest::Digest::finalize(self)) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_digest_traits() { + // Inherent methods. 
+ let mut hasher1 = crate::Hasher::new(); + hasher1.update(b"foo"); + hasher1.update(b"bar"); + hasher1.update(b"baz"); + let out1 = hasher1.finalize(); + let mut xof1 = [0; 301]; + hasher1.finalize_xof().fill(&mut xof1); + assert_eq!(out1.as_bytes(), &xof1[..32]); + + // Trait implementations. + let mut hasher2: crate::Hasher = digest::Digest::new(); + digest::Digest::update(&mut hasher2, b"xxx"); + digest::Digest::reset(&mut hasher2); + digest::Digest::update(&mut hasher2, b"foo"); + digest::Digest::update(&mut hasher2, b"bar"); + digest::Digest::update(&mut hasher2, b"baz"); + let out2 = digest::Digest::finalize(hasher2.clone()); + let mut xof2 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()), + &mut xof2, + ); + assert_eq!(out1.as_bytes(), &out2[..]); + assert_eq!(xof1[..], xof2[..]); + + // Again with the resetting variants. + let mut hasher3: crate::Hasher = digest::Digest::new(); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut out3 = [0; 32]; + digest::FixedOutput::finalize_into_reset( + &mut hasher3, + GenericArray::from_mut_slice(&mut out3), + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut out4 = [0; 32]; + digest::FixedOutput::finalize_into_reset( + &mut hasher3, + GenericArray::from_mut_slice(&mut out4), + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut xof3 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutput::finalize_xof_reset(&mut hasher3), + &mut xof3, + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut xof4 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutput::finalize_xof_reset(&mut hasher3), + &mut xof4, + ); + assert_eq!(out1.as_bytes(), &out3[..]); + assert_eq!(out1.as_bytes(), &out4[..]); + assert_eq!(xof1[..], xof3[..]); + assert_eq!(xof1[..], xof4[..]); + } + + #[test] + fn test_mac_trait() { + // Inherent methods. 
+ let key = b"some super secret key bytes fooo"; + let mut hasher1 = crate::Hasher::new_keyed(key); + hasher1.update(b"foo"); + hasher1.update(b"bar"); + hasher1.update(b"baz"); + let out1 = hasher1.finalize(); + + // Trait implementation. + let generic_key = (*key).into(); + let mut hasher2: crate::Hasher = crypto_mac::NewMac::new(&generic_key); + crypto_mac::Mac::update(&mut hasher2, b"xxx"); + crypto_mac::Mac::reset(&mut hasher2); + crypto_mac::Mac::update(&mut hasher2, b"foo"); + crypto_mac::Mac::update(&mut hasher2, b"bar"); + crypto_mac::Mac::update(&mut hasher2, b"baz"); + let out2 = crypto_mac::Mac::finalize(hasher2); + assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice()); + } +} diff --git a/3rdparty/BLAKE3/test_vectors/Cargo.toml b/3rdparty/BLAKE3/test_vectors/Cargo.toml new file mode 100644 index 000000000..cd74a9df0 --- /dev/null +++ b/3rdparty/BLAKE3/test_vectors/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "test_vectors" +version = "0.0.0" +edition = "2018" + +[features] +neon = ["blake3/neon"] +prefer_intrinsics = ["blake3/prefer_intrinsics"] +pure = ["blake3/pure"] + +[dependencies] +# If you ever change these path dependencies, you'll probably need to update +# cross_test.sh, or CI will break. I'm sorry >.< +blake3 = { path = "../" } +hex = "0.4.0" +reference_impl = { path = "../reference_impl" } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" diff --git a/3rdparty/BLAKE3/test_vectors/cross_test.sh b/3rdparty/BLAKE3/test_vectors/cross_test.sh new file mode 100644 index 000000000..c4d280c9d --- /dev/null +++ b/3rdparty/BLAKE3/test_vectors/cross_test.sh @@ -0,0 +1,25 @@ +#! /usr/bin/env bash + +# This hacky script works around the fact that `cross test` does not support +# path dependencies. (It uses a docker shared folder to let the guest access +# project files, so parent directories aren't available.) 
Solve this problem by +# copying the entire project to a temp dir and rearranging paths to put +# "blake3" and "reference_impl" underneath "test_vectors", so that everything +# is accessible. Hopefully this will just run on CI forever and no one will +# ever read this and discover my deep shame. + +set -e -u -o pipefail + +project_root="$(realpath "$(dirname "$BASH_SOURCE")/..")" +tmpdir="$(mktemp -d)" +echo "Running cross tests in $tmpdir" +cd "$tmpdir" +git clone "$project_root" blake3 +mv blake3/test_vectors . +mv blake3/reference_impl test_vectors +mv blake3 test_vectors +cd test_vectors +sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml +sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml + +cross test "$@" diff --git a/3rdparty/BLAKE3/test_vectors/src/lib.rs b/3rdparty/BLAKE3/test_vectors/src/lib.rs new file mode 100644 index 000000000..04460f668 --- /dev/null +++ b/3rdparty/BLAKE3/test_vectors/src/lib.rs @@ -0,0 +1,349 @@ +use blake3::{BLOCK_LEN, CHUNK_LEN}; +use serde::{Deserialize, Serialize}; + +// A non-multiple of 4 is important, since one possible bug is to fail to emit +// partial words. 
+pub const OUTPUT_LEN: usize = 2 * blake3::BLOCK_LEN + 3; + +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_KEY: &[u8; blake3::KEY_LEN] = b"whats the Elvish word for friend"; +pub const TEST_CONTEXT: &str = "BLAKE3 2019-12-27 16:29:52 test vectors context"; + +const COMMENT: &str = r#" +Each test is an input length and three outputs, one for each of the hash, +keyed_hash, and derive_key modes. The input in each case is filled with a +repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. +The key used with keyed_hash is the 32-byte ASCII string "whats the Elvish word +for friend", also given in the `key` field below. The context string used with +derive_key is the ASCII string "BLAKE3 2019-12-27 16:29:52 test vectors +context", also given in the `context_string` field below. Outputs are encoded +as hexadecimal. Each case is an extended output, and implementations should +also check that the first 32 bytes match their default-length output. +"#; + +// Paint the input with a repeating byte pattern. We use a cycle length of 251, +// because that's the largest prime number less than 256. This makes it +// unlikely that swapping any two adjacent input blocks or chunks will give the +// same answer. 
+pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct Cases { + pub _comment: String, + pub key: String, + pub context_string: String, + pub cases: Vec<Case>, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct Case { + pub input_len: usize, + pub hash: String, + pub keyed_hash: String, + pub derive_key: String, +} + +pub fn generate_json() -> String { + let mut cases = Vec::new(); + for &input_len in TEST_CASES { + let mut input = vec![0; input_len]; + paint_test_input(&mut input); + + let mut hash_out = [0; OUTPUT_LEN]; + blake3::Hasher::new() + .update(&input) + .finalize_xof() + .fill(&mut hash_out); + + let mut keyed_hash_out = [0; OUTPUT_LEN]; + blake3::Hasher::new_keyed(TEST_KEY) + .update(&input) + .finalize_xof() + .fill(&mut keyed_hash_out); + + let mut derive_key_out = [0; OUTPUT_LEN]; + blake3::Hasher::new_derive_key(TEST_CONTEXT) + .update(&input) + .finalize_xof() + .fill(&mut derive_key_out); + + cases.push(Case { + input_len, + hash: hex::encode(&hash_out[..]), + keyed_hash: hex::encode(&keyed_hash_out[..]), + derive_key: hex::encode(&derive_key_out[..]), + }); + } + + let mut json = serde_json::to_string_pretty(&Cases { + _comment: COMMENT.trim().replace("\n", " "), + key: std::str::from_utf8(TEST_KEY).unwrap().to_string(), + context_string: TEST_CONTEXT.to_string(), + cases, + }) + .unwrap(); + + // Add a trailing newline. 
+ json.push('\n'); + json +} + +pub fn read_test_vectors_file() -> String { + let test_vectors_file_path = "./test_vectors.json"; + std::fs::read_to_string(test_vectors_file_path).expect("failed to read test_vectors.json") +} + +pub fn parse_test_cases() -> Cases { + let json = read_test_vectors_file(); + serde_json::from_str(&json).expect("failed to parse test_vectors.json") +} + +#[cfg(test)] +mod tests { + use super::*; + use std::convert::TryInto; + + fn test_reference_impl_all_at_once( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + let mut out = vec![0; expected_hash.len()]; + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input); + hasher.finalize(&mut out); + assert_eq!(expected_hash, &out[..]); + + let mut out = vec![0; expected_keyed_hash.len()]; + let mut hasher = reference_impl::Hasher::new_keyed(key); + hasher.update(input); + hasher.finalize(&mut out); + assert_eq!(expected_keyed_hash, &out[..]); + + let mut out = vec![0; expected_derive_key.len()]; + let mut hasher = reference_impl::Hasher::new_derive_key(TEST_CONTEXT); + hasher.update(input); + hasher.finalize(&mut out); + assert_eq!(expected_derive_key, &out[..]); + } + + fn test_reference_impl_one_at_a_time( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + let mut out = vec![0; expected_hash.len()]; + let mut hasher = reference_impl::Hasher::new(); + for &b in input { + hasher.update(&[b]); + } + hasher.finalize(&mut out); + assert_eq!(expected_hash, &out[..]); + + let mut out = vec![0; expected_keyed_hash.len()]; + let mut hasher = reference_impl::Hasher::new_keyed(key); + for &b in input { + hasher.update(&[b]); + } + hasher.finalize(&mut out); + assert_eq!(expected_keyed_hash, &out[..]); + + let mut out = vec![0; expected_derive_key.len()]; + let mut hasher = 
reference_impl::Hasher::new_derive_key(TEST_CONTEXT); + for &b in input { + hasher.update(&[b]); + } + hasher.finalize(&mut out); + assert_eq!(expected_derive_key, &out[..]); + } + + fn test_incremental_all_at_once( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + let mut out = vec![0; expected_hash.len()]; + let mut hasher = blake3::Hasher::new(); + hasher.update(input); + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_hash, &out[..]); + assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes()); + + let mut out = vec![0; expected_keyed_hash.len()]; + let mut hasher = blake3::Hasher::new_keyed(key); + hasher.update(input); + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_keyed_hash, &out[..]); + assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes()); + + let mut out = vec![0; expected_derive_key.len()]; + let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT); + hasher.update(input); + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_derive_key, &out[..]); + assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes()); + } + + fn test_incremental_one_at_a_time( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + let mut out = vec![0; expected_hash.len()]; + let mut hasher = blake3::Hasher::new(); + for &b in input { + hasher.update(&[b]); + } + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_hash, &out[..]); + assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes()); + + let mut out = vec![0; expected_keyed_hash.len()]; + let mut hasher = blake3::Hasher::new_keyed(key); + for &b in input { + hasher.update(&[b]); + } + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_keyed_hash, &out[..]); + assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes()); + + let mut out = vec![0; 
expected_derive_key.len()]; + let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT); + for &b in input { + hasher.update(&[b]); + } + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_derive_key, &out[..]); + assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes()); + } + + fn test_recursive( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + assert_eq!(&expected_hash[..32], blake3::hash(input).as_bytes()); + assert_eq!( + &expected_keyed_hash[..32], + &blake3::keyed_hash(key, input).as_bytes()[..], + ); + let mut derive_key_out = vec![0; expected_derive_key.len()]; + blake3::derive_key(TEST_CONTEXT, input, &mut derive_key_out); + assert_eq!(expected_derive_key, &derive_key_out[..],); + } + + #[test] + fn run_test_vectors() { + let cases = parse_test_cases(); + let key: &[u8; blake3::KEY_LEN] = cases.key.as_bytes().try_into().unwrap(); + for case in &cases.cases { + dbg!(case.input_len); + let mut input = vec![0; case.input_len]; + paint_test_input(&mut input); + let expected_hash = hex::decode(&case.hash).unwrap(); + let expected_keyed_hash = hex::decode(&case.keyed_hash).unwrap(); + let expected_derive_key = hex::decode(&case.derive_key).unwrap(); + + test_reference_impl_all_at_once( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + + test_reference_impl_one_at_a_time( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + + test_incremental_all_at_once( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + + test_incremental_one_at_a_time( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + + test_recursive( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + } + } + + #[test] + fn test_checked_in_vectors_up_to_date() { + // Replace Windows newlines, in case Git 
is configured to alter + // newlines when files are checked out. + let json = read_test_vectors_file().replace("\r\n", "\n"); + if generate_json() != json { + panic!("Checked-in test_vectors.json is not up to date. Regenerate with `cargo run --bin generate > ./test_vectors.json`."); + } + } +} diff --git a/3rdparty/BLAKE3/test_vectors/test_vectors.json b/3rdparty/BLAKE3/test_vectors/test_vectors.json new file mode 100644 index 000000000..f6da91792 --- /dev/null +++ b/3rdparty/BLAKE3/test_vectors/test_vectors.json @@ -0,0 +1,217 @@ +{ + "_comment": "Each test is an input length and three outputs, one for each of the hash, keyed_hash, and derive_key modes. The input in each case is filled with a repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. The key used with keyed_hash is the 32-byte ASCII string \"whats the Elvish word for friend\", also given in the `key` field below. The context string used with derive_key is the ASCII string \"BLAKE3 2019-12-27 16:29:52 test vectors context\", also given in the `context_string` field below. Outputs are encoded as hexadecimal. 
Each case is an extended output, and implementations should also check that the first 32 bytes match their default-length output.", + "key": "whats the Elvish word for friend", + "context_string": "BLAKE3 2019-12-27 16:29:52 test vectors context", + "cases": [ + { + "input_len": 0, + "hash": "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d", + "keyed_hash": "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f", + "derive_key": "2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0" + }, + { + "input_len": 1, + "hash": "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5", + "keyed_hash": "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11", + "derive_key": "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551" + }, + { + 
"input_len": 2, + "hash": "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63d8386b22e2ddc05836b7c1bb693d92af006deb5ffbc4c70fb44d0195d0c6f252faac61659ef86523aa16517f87cb5f1340e723756ab65efb2f91964e14391de2a432263a6faf1d146937b35a33621c12d00be8223a7f1919cec0acd12097ff3ab00ab1", + "keyed_hash": "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a9ffbf7efd13e989a6c246f96d3a96b9d279f2c4e63fb0bdff633957acf50ee1a5f658be144bab0f6f16500dee4aa5967fc2c586d85a04caddec90fffb7633f46a60786024353b9e5cebe277fcd9514217fee2267dcda8f7b31697b7c54fab6a939bf8f", + "derive_key": "1f166565a7df0098ee65922d7fea425fb18b9943f19d6161e2d17939356168e6daa59cae19892b2d54f6fc9f475d26031fd1c22ae0a3e8ef7bdb23f452a15e0027629d2e867b1bb1e6ab21c71297377750826c404dfccc2406bd57a83775f89e0b075e59a7732326715ef912078e213944f490ad68037557518b79c0086de6d6f6cdd2" + }, + { + "input_len": 3, + "hash": "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f5b49b82f805a538c68915c1ae8035c900fd1d4b13902920fd05e1450822f36de9454b7e9996de4900c8e723512883f93f4345f8a58bfe64ee38d3ad71ab027765d25cdd0e448328a8e7a683b9a6af8b0af94fa09010d9186890b096a08471e4230a134", + "keyed_hash": "39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9b9142d0fab08e1b161efdbb28d18afc64d8f72160c958e53a950cdecf91c1a1bbab1a9c0f01def762a77e2e8545d4dec241e98a89b6db2e9a5b070fc110caae2622690bd7b76c02ab60750a3ea75426a6bb8803c370ffe465f07fb57def95df772c39f", + "derive_key": "440aba35cb006b61fc17c0529255de438efc06a8c9ebf3f2ddac3b5a86705797f27e2e914574f4d87ec04c379e12789eccbfbc15892626042707802dbe4e97c3ff59dca80c1e54246b6d055154f7348a39b7d098b2b4824ebe90e104e763b2a447512132cede16243484a55a4e40a85790038bb0dcf762e8c053cabae41bbe22a5bff7" + }, + { + "input_len": 4, + "hash": 
"f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f328f603ba4564453e06cdcee6cbe728a4519bbe6f0d41e8a14b5b225174a566dbfa61b56afb1e452dc08c804f8c3143c9e2cc4a31bb738bf8c1917b55830c6e65797211701dc0b98daa1faeaa6ee9e56ab606ce03a1a881e8f14e87a4acf4646272cfd12", + "keyed_hash": "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe070116c0acff3f0d1fa97ab38d813fd46506089118147d83393019b068a55d646251ecf81105f798d76a10ae413f3d925787d6216a7eb444e510fd56916f1d753a5544ecf0072134a146b2615b42f50c179f56b8fae0788008e3e27c67482349e249cb86a", + "derive_key": "f46085c8190d69022369ce1a18880e9b369c135eb93f3c63550d3e7630e91060fbd7d8f4258bec9da4e05044f88b91944f7cab317a2f0c18279629a3867fad0662c9ad4d42c6f27e5b124da17c8c4f3a94a025ba5d1b623686c6099d202a7317a82e3d95dae46a87de0555d727a5df55de44dab799a20dffe239594d6e99ed17950910" + }, + { + "input_len": 5, + "hash": "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2ebcfdac6c5d32c31209e1f81a454751280db64942ce395104e1e4eaca62607de1c2ca748251754ea5bbe8c20150e7f47efd57012c63b3c6a6632dc1c7cd15f3e1c999904037d60fac2eb9397f2adbe458d7f264e64f1e73aa927b30988e2aed2f03620", + "keyed_hash": "73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c84616ab199d1f2f3e53bffb17f0a2209fe8b4f7d4c7bae59c2bc7d01f1ff94c67588cc6b38fa6024886f2c078bfe09b5d9e6584cd6c521c3bb52f4de7687b37117a2dbbec0d59e92fa9a8cc3240d4432f91757aabcae03e87431dac003e7d73574bfdd8218", + "derive_key": "1f24eda69dbcb752847ec3ebb5dd42836d86e58500c7c98d906ecd82ed9ae47f6f48a3f67e4e43329c9a89b1ca526b9b35cbf7d25c1e353baffb590fd79be58ddb6c711f1a6b60e98620b851c688670412fcb0435657ba6b638d21f0f2a04f2f6b0bd8834837b10e438d5f4c7c2c71299cf7586ea9144ed09253d51f8f54dd6bff719d" + }, + { + "input_len": 6, + "hash": "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c844611a30c4e4f37fe2fe23c0883cde5cf7059d88b657c7ed2087e3d210925ede716435d6d5d82597a1e52b9553919e804f5656278bd739880692c94bff2824d8e0b48cac1d24682699e4883389dc4f2faa2eb3b4db6e39debd5061ff3609916f3e07529a", + 
"keyed_hash": "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6dc2ffb3aa05f2631f0fa9ac19b6e97eb7e6669e5ec254799350c8b8d189e8807800842a5383c4d907c932f34490aaf00064de8cdb157357bde37c1504d2960034930887603abc5ccb9f5247f79224baff6120a3c622a46d7b1bcaee02c5025460941256", + "derive_key": "be96b30b37919fe4379dfbe752ae77b4f7e2ab92f7ff27435f76f2f065f6a5f435ae01a1d14bd5a6b3b69d8cbd35f0b01ef2173ff6f9b640ca0bd4748efa398bf9a9c0acd6a66d9332fdc9b47ffe28ba7ab6090c26747b85f4fab22f936b71eb3f64613d8bd9dfabe9bb68da19de78321b481e5297df9e40ec8a3d662f3e1479c65de0" + }, + { + "input_len": 7, + "hash": "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe66036ef6e6d1a8f54baa9fed1fc11c77cfb9cff65bae915045027046ebe0c01bf5a941f3bb0f73791d3fc0b84370f9f30af0cd5b0fc334dd61f70feb60dad785f070fef1f343ed933b49a5ca0d16a503f599a365a4296739248b28d1a20b0e2cc8975c", + "keyed_hash": "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a7c5035ba0f48a9c73dabb2be0533d02e8fd5d0d5639a18b2803ba6bf527e1d145d5fd6406c437b79bcaad6c7bdf1cf4bd56a893c3eb9510335a7a798548c6753f74617bede88bef924ba4b334f8852476d90b26c5dc4c3668a2519266a562c6c8034a6", + "derive_key": "dc3b6485f9d94935329442916b0d059685ba815a1fa2a14107217453a7fc9f0e66266db2ea7c96843f9d8208e600a73f7f45b2f55b9e6d6a7ccf05daae63a3fdd10b25ac0bd2e224ce8291f88c05976d575df998477db86fb2cfbbf91725d62cb57acfeb3c2d973b89b503c2b60dde85a7802b69dc1ac2007d5623cbea8cbfb6b181f5" + }, + { + "input_len": 8, + "hash": "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb725d6d46ceed8f785ab9f2f9b06acfe398c6699c6129da084cb531177445a682894f9685eaf836999221d17c9a64a3a057000524cd2823986db378b074290a1a9b93a22e135ed2c14c7e20c6d045cd00b903400374126676ea78874d79f2dd7883cf5c", + "keyed_hash": 
"be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a060048eb5e68ce6dea1eb0229e144f578b3aa7e9f4f85febd135df8525e6fe40c6f0340d13dd09b255ccd5112a94238f2be3c0b5b7ecde06580426a93e0708555a265305abf86d874e34b4995b788e37a823491f25127a502fe0704baa6bfdf04e76c13276", + "derive_key": "2b166978cef14d9d438046c720519d8b1cad707e199746f1562d0c87fbd32940f0e2545a96693a66654225ebbaac76d093bfa9cd8f525a53acb92a861a98c42e7d1c4ae82e68ab691d510012edd2a728f98cd4794ef757e94d6546961b4f280a51aac339cc95b64a92b83cc3f26d8af8dfb4c091c240acdb4d47728d23e7148720ef04" + }, + { + "input_len": 63, + "hash": "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b1197012b1e7d9af4d7cb7bdd1f3bb49a90a9b5dec3ea2bbc6eaebce77f4e470cbf4687093b5352f04e4a4570fba233164e6acc36900e35d185886a827f7ea9bdc1e5c3ce88b095a200e62c10c043b3e9bc6cb9b6ac4dfa51794b02ace9f98779040755", + "keyed_hash": "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37aea05a63d74a840aecd514f654f080e51ac50fd617d22610d91780fe6b07a26b0847abb38291058c97474ef6ddd190d30fc318185c09ca1589d2024f0a6f16d45f11678377483fa5c005b2a107cb9943e5da634e7046855eaa888663de55d6471371d55d", + "derive_key": "b6451e30b953c206e34644c6803724e9d2725e0893039cfc49584f991f451af3b89e8ff572d3da4f4022199b9563b9d70ebb616efff0763e9abec71b550f1371e233319c4c4e74da936ba8e5bbb29a598e007a0bbfa929c99738ca2cc098d59134d11ff300c39f82e2fce9f7f0fa266459503f64ab9913befc65fddc474f6dc1c67669" + }, + { + "input_len": 64, + "hash": "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98fc9cc56cb831ffe33ea8e7e1d1df09b26efd2767670066aa82d023b1dfe8ab1b2b7fbb5b97592d46ffe3e05a6a9b592e2949c74160e4674301bc3f97e04903f8c6cf95b863174c33228924cdef7ae47559b10b294acd660666c4538833582b43f82d74", + "keyed_hash": "ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e68244c26010afc3f762615bbac552a1ca909e67c83e2fd5478cf46b9e811efccc93f77a21b17a152ebaca1695733fdb086e23cd0eb48c41c034d52523fc21236e5d8c9255306e48d52ba40b4dac24256460d56573d1312319afcf3ed39d72d0bfc69acb", 
+ "derive_key": "a5c4a7053fa86b64746d4bb688d06ad1f02a18fce9afd3e818fefaa7126bf73e9b9493a9befebe0bf0c9509fb3105cfa0e262cde141aa8e3f2c2f77890bb64a4cca96922a21ead111f6338ad5244f2c15c44cb595443ac2ac294231e31be4a4307d0a91e874d36fc9852aeb1265c09b6e0cda7c37ef686fbbcab97e8ff66718be048bb" + }, + { + "input_len": 65, + "hash": "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee0e16e0a4749d6811dd1d6d1265c29729b1b75a9ac346cf93f0e1d7296dfcfd4313b3a227faaaaf7757cc95b4e87a49be3b8a270a12020233509b1c3632b3485eef309d0abc4a4a696c9decc6e90454b53b000f456a3f10079072baaf7a981653221f2c", + "keyed_hash": "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72d5e795f46a596b02d3d4bfb43abad1e5d19211152722ec1f20fef2cd413e3c22f2fc5da3d73041275be6ede3517b3b9f0fc67ade5956a672b8b75d96cb43294b9041497de92637ed3f2439225e683910cb3ae923374449ca788fb0f9bea92731bc26ad", + "derive_key": "51fd05c3c1cfbc8ed67d139ad76f5cf8236cd2acd26627a30c104dfd9d3ff8a82b02e8bd36d8498a75ad8c8e9b15eb386970283d6dd42c8ae7911cc592887fdbe26a0a5f0bf821cd92986c60b2502c9be3f98a9c133a7e8045ea867e0828c7252e739321f7c2d65daee4468eb4429efae469a42763f1f94977435d10dccae3e3dce88d" + }, + { + "input_len": 127, + "hash": "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640de3137d477156d1fde56b0cf36f8ef18b44b2d79897bece12227539ac9ae0a5119da47644d934d26e74dc316145dcb8bb69ac3f2e05c242dd6ee06484fcb0e956dc44355b452c5e2bbb5e2b66e99f5dd443d0cbcaaafd4beebaed24ae2f8bb672bcef78", + "keyed_hash": "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818ee7a317aaccc1458f78d6f65f3427ec97d9c0adb0d6dacd4471374b621b7b5f35cd54663c64dbe0b9e2d95632f84c611313ea5bd90b71ce97b3cf645776f3adc11e27d135cbadb9875c2bf8d3ae6b02f8a0206aba0c35bfe42574011931c9a255ce6dc", + "derive_key": 
"c91c090ceee3a3ac81902da31838012625bbcd73fcb92e7d7e56f78deba4f0c3feeb3974306966ccb3e3c69c337ef8a45660ad02526306fd685c88542ad00f759af6dd1adc2e50c2b8aac9f0c5221ff481565cf6455b772515a69463223202e5c371743e35210bbbbabd89651684107fd9fe493c937be16e39cfa7084a36207c99bea3" + }, + { + "input_len": 128, + "hash": "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45efa69faba091427f9c5c4caa873aa07828651f19c55bad85c47d1368b11c6fd99e47ecba5820a0325984d74fe3e4058494ca12e3f1d3293d0010a9722f7dee64f71246f75e9361f44cc8e214a100650db1313ff76a9f93ec6e84edb7add1cb4a95019b0c", + "keyed_hash": "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ecd86bbe0001f10bda47e6077b735016fca8119da11348d93ca302bbd125bde0db2b50edbe728a620bb9d3e6f706286aedea973425c0b9eedf8a38873544cf91badf49ad92a635a93f71ddfcee1eae536c25d1b270956be16588ef1cfef2f1d15f650bd5", + "derive_key": "81720f34452f58a0120a58b6b4608384b5c51d11f39ce97161a0c0e442ca022550e7cd651e312f0b4c6afb3c348ae5dd17d2b29fab3b894d9a0034c7b04fd9190cbd90043ff65d1657bbc05bfdecf2897dd894c7a1b54656d59a50b51190a9da44db426266ad6ce7c173a8c0bbe091b75e734b4dadb59b2861cd2518b4e7591e4b83c9" + }, + { + "input_len": 129, + "hash": "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12f96ffa7b36dd78ba321be7e842d364a62a42e3746681c8bace18a4a8a79649285c7127bf8febf125be9de39586d251f0d41da20980b70d35e3dac0eee59e468a894fa7e6a07129aaad09855f6ad4801512a116ba2b7841e6cfc99ad77594a8f2d181a7", + "keyed_hash": "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f1c9db018541eec241b748f87725665b7b1ace3e0065b29c3bcb232c90e37897fa5aaee7e1e8a2ecfcd9b51463e42238cfdd7fee1aecb3267fa7f2128079176132a412cd8aaf0791276f6b98ff67359bd8652ef3a203976d5ff1cd41885573487bcd683", + "derive_key": 
"938d2d4435be30eafdbb2b7031f7857c98b04881227391dc40db3c7b21f41fc18d72d0f9c1de5760e1941aebf3100b51d64644cb459eb5d20258e233892805eb98b07570ef2a1787cd48e117c8d6a63a68fd8fc8e59e79dbe63129e88352865721c8d5f0cf183f85e0609860472b0d6087cefdd186d984b21542c1c780684ed6832d8d" + }, + { + "input_len": 1023, + "hash": "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", + "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", + "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d" + }, + { + "input_len": 1024, + "hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", + "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", + "derive_key": 
"7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad" + }, + { + "input_len": 1025, + "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", + "keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", + "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad" + }, + { + "input_len": 2048, + "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", + "keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", + "derive_key": 
"7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583" + }, + { + "input_len": 2049, + "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", + "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", + "derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6" + }, + { + "input_len": 3072, + "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", + "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", + "derive_key": 
"050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0" + }, + { + "input_len": 3073, + "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", + "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", + "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5" + }, + { + "input_len": 4096, + "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", + "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", + "derive_key": 
"1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245" + }, + { + "input_len": 4097, + "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", + "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", + "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad" + }, + { + "input_len": 5120, + "hash": "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", + "keyed_hash": "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", + "derive_key": 
"7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d" + }, + { + "input_len": 5121, + "hash": "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", + "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", + "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165" + }, + { + "input_len": 6144, + "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", + "keyed_hash": "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", + "derive_key": 
"2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef" + }, + { + "input_len": 6145, + "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", + "keyed_hash": "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", + "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2" + }, + { + "input_len": 7168, + "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", + "keyed_hash": "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", + "derive_key": 
"11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88" + }, + { + "input_len": 7169, + "hash": "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", + "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", + "derive_key": "554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd" + }, + { + "input_len": 8192, + "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", + "keyed_hash": "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", + "derive_key": 
"ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7" + }, + { + "input_len": 8193, + "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", + "keyed_hash": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", + "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0" + }, + { + "input_len": 16384, + "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", + "keyed_hash": "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", + "derive_key": 
"160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57" + }, + { + "input_len": 31744, + "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", + "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", + "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b" + }, + { + "input_len": 102400, + "hash": "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e01c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e", + "keyed_hash": "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4", + "derive_key": 
"4652cff7a3f385a6103b5c260fc1593e13c778dbe608efb092fe7ee69df6e9c6d83a3e041bc3a48df2879f4a0a3ed40e7c961c73eff740f3117a0504c2dff4786d44fb17f1549eb0ba585e40ec29bf7732f0b7e286ff8acddc4cb1e23b87ff5d824a986458dcc6a04ac83969b80637562953df51ed1a7e90a7926924d2763778be8560" + } + ] +} diff --git a/3rdparty/BLAKE3/tools/compiler_version/Cargo.toml b/3rdparty/BLAKE3/tools/compiler_version/Cargo.toml new file mode 100644 index 000000000..1046cf29d --- /dev/null +++ b/3rdparty/BLAKE3/tools/compiler_version/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "compiler_version" +version = "0.0.0" +edition = "2018" + +[build-dependencies] +cc = "1.0.50" diff --git a/3rdparty/BLAKE3/tools/compiler_version/build.rs b/3rdparty/BLAKE3/tools/compiler_version/build.rs new file mode 100644 index 000000000..3e14ebe67 --- /dev/null +++ b/3rdparty/BLAKE3/tools/compiler_version/build.rs @@ -0,0 +1,6 @@ +fn main() { + let build = cc::Build::new(); + let compiler = build.get_compiler(); + let compiler_path = compiler.path().to_string_lossy(); + println!("cargo:rustc-env=COMPILER_PATH={}", compiler_path); +} diff --git a/3rdparty/BLAKE3/tools/compiler_version/src/main.rs b/3rdparty/BLAKE3/tools/compiler_version/src/main.rs new file mode 100644 index 000000000..767cb31bd --- /dev/null +++ b/3rdparty/BLAKE3/tools/compiler_version/src/main.rs @@ -0,0 +1,27 @@ +use std::process::Command; + +fn main() { + // Print the rustc version. + Command::new(env!("CARGO")) + .args(&["rustc", "--quiet", "--", "--version"]) + .status() + .unwrap(); + println!(); + + // Print the Cargo version. + Command::new(env!("CARGO")) + .args(&["--version"]) + .status() + .unwrap(); + println!(); + + // Print the C compiler version. This relies on C compiler detection done + // in build.rs, which sets the COMPILER_PATH variable. + let compiler_path = env!("COMPILER_PATH"); + let mut compiler_command = Command::new(compiler_path); + // Use the --version flag on everything other than MSVC. 
+ if !cfg!(target_env = "msvc") { + compiler_command.arg("--version"); + } + let _ = compiler_command.status().unwrap(); +} diff --git a/3rdparty/BLAKE3/tools/instruction_set_support/Cargo.toml b/3rdparty/BLAKE3/tools/instruction_set_support/Cargo.toml new file mode 100644 index 000000000..9e30174a9 --- /dev/null +++ b/3rdparty/BLAKE3/tools/instruction_set_support/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "instruction_set_support" +version = "0.0.0" +edition = "2018" + +[dependencies] diff --git a/3rdparty/BLAKE3/tools/instruction_set_support/src/main.rs b/3rdparty/BLAKE3/tools/instruction_set_support/src/main.rs new file mode 100644 index 000000000..6b509b053 --- /dev/null +++ b/3rdparty/BLAKE3/tools/instruction_set_support/src/main.rs @@ -0,0 +1,10 @@ +fn main() { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + dbg!(is_x86_feature_detected!("sse2")); + dbg!(is_x86_feature_detected!("sse4.1")); + dbg!(is_x86_feature_detected!("avx2")); + dbg!(is_x86_feature_detected!("avx512f")); + dbg!(is_x86_feature_detected!("avx512vl")); + } +} diff --git a/3rdparty/utfcpp/.circleci/config.yml b/3rdparty/utfcpp/.circleci/config.yml new file mode 100644 index 000000000..b2cbdaf99 --- /dev/null +++ b/3rdparty/utfcpp/.circleci/config.yml @@ -0,0 +1,13 @@ +version: 2 + +jobs: + build: + docker: + - image: nemtrif/utf8cpp:3.0.1 + steps: + - checkout + - run: git submodule update --init --recursive --remote + - run: mkdir build + - run: cd build && cmake .. + - run: cd build && cmake --build . + - run: cd build && ctest -VV diff --git a/3rdparty/utfcpp/.gitignore b/3rdparty/utfcpp/.gitignore new file mode 100644 index 000000000..488d51dd9 --- /dev/null +++ b/3rdparty/utfcpp/.gitignore @@ -0,0 +1,4 @@ +# VS Code: +.vscode/ +# Often used by CMake +build/
\ No newline at end of file diff --git a/3rdparty/utfcpp/.gitmodules b/3rdparty/utfcpp/.gitmodules new file mode 100644 index 000000000..d2ac8470d --- /dev/null +++ b/3rdparty/utfcpp/.gitmodules @@ -0,0 +1,3 @@ +[submodule "extern/gtest"] + path = extern/gtest + url = https://github.com/google/googletest diff --git a/3rdparty/utfcpp/CMakeLists.txt b/3rdparty/utfcpp/CMakeLists.txt new file mode 100644 index 000000000..4e63087bc --- /dev/null +++ b/3rdparty/utfcpp/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required (VERSION 3.0.2) +project (utf8cpp VERSION 3.1.2 LANGUAGES CXX) + +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(IS_ROOT_PROJECT ON) +else() + set(IS_ROOT_PROJECT OFF) +endif() + +option(UTF8_TESTS "Enable tests for UTF8-CPP" ${IS_ROOT_PROJECT}) +option(UTF8_INSTALL "Enable installation for UTF8-CPP" ${IS_ROOT_PROJECT}) +option(UTF8_SAMPLES "Enable building samples for UTF8-CPP" ${IS_ROOT_PROJECT}) + +add_library(utf8cpp INTERFACE) +target_include_directories(utf8cpp INTERFACE + "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/source>" + $<INSTALL_INTERFACE:include/utf8cpp> +) +add_library(utf8::cpp ALIAS utf8cpp) + +if(UTF8_INSTALL) + if(MSVC) + set(DEF_INSTALL_CMAKE_DIR CMake) + else() + include(GNUInstallDirs) # define CMAKE_INSTALL_* + set(DEF_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/utf8cpp) + endif() + + install(DIRECTORY source/ DESTINATION include/utf8cpp) + install(TARGETS utf8cpp EXPORT utf8cppConfig) + install(EXPORT utf8cppConfig DESTINATION ${DEF_INSTALL_CMAKE_DIR}) +endif() + +if(UTF8_SAMPLES) + add_executable(docsample ${PROJECT_SOURCE_DIR}/samples/docsample.cpp) + target_link_libraries(docsample PRIVATE utf8::cpp) +endif() + +if(UTF8_TESTS) + enable_testing() + add_subdirectory(extern/gtest) + add_subdirectory(tests) +endif() diff --git a/3rdparty/utfcpp/LICENSE b/3rdparty/utfcpp/LICENSE new file mode 100644 index 000000000..36b7cd93c --- /dev/null +++ b/3rdparty/utfcpp/LICENSE @@ -0,0 +1,23 @@ +Boost Software License 
- Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/3rdparty/utfcpp/README.md b/3rdparty/utfcpp/README.md new file mode 100644 index 000000000..0c689cf12 --- /dev/null +++ b/3rdparty/utfcpp/README.md @@ -0,0 +1,1302 @@ +# UTF8-CPP: UTF-8 with C++ in a Portable Way + + +## Introduction + +C++ developers miss an easy and portable way of handling Unicode encoded strings. The original C++ Standard (known as C++98 or C++03) is Unicode agnostic. 
C++11 provides some support for Unicode on core language and library level: u8, u, and U character and string literals, char16_t and char32_t character types, u16string and u32string library classes, and codecvt support for conversions between Unicode encoding forms. In the meantime, developers use third party libraries like ICU, OS specific capabilities, or simply roll out their own solutions. + +In order to easily handle UTF-8 encoded Unicode strings, I came up with a small, C++98 compatible generic library. For anybody used to working with STL algorithms and iterators, it should be easy and natural to use. The code is freely available for any purpose - check out the [license](./LICENSE). The library has been used a lot in the past ten years both in commercial and open-source projects and is considered feature-complete now. If you run into bugs or performance issues, please let me know and I'll do my best to address them. + +The purpose of this article is not to offer an introduction to Unicode in general, and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out the [Unicode Home Page](http://www.unicode.org/) or some other source of information for Unicode. Also, it is not my aim to advocate the use of UTF-8 encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from C++, I am sure you have good reasons for it.
+ +## Examples of use + +### Introductory Sample + +To illustrate the use of the library, let's start with a small but complete program that opens a file containing UTF-8 encoded text, reads it line by line, checks each line for invalid UTF-8 byte sequences, and converts it to UTF-16 encoding and back to UTF-8: + +```cpp +#include <fstream> +#include <iostream> +#include <string> +#include <vector> +#include "utf8.h" +using namespace std; +int main(int argc, char** argv) +{ + if (argc != 2) { + cout << "\nUsage: docsample filename\n"; + return 0; + } + const char* test_file_path = argv[1]; + // Open the test file (must be UTF-8 encoded) + ifstream fs8(test_file_path); + if (!fs8.is_open()) { + cout << "Could not open " << test_file_path << endl; + return 0; + } + + unsigned line_count = 1; + string line; + // Play with all the lines in the file + while (getline(fs8, line)) { + // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) +#if __cplusplus >= 201103L // C++ 11 or later + auto end_it = utf8::find_invalid(line.begin(), line.end()); +#else + string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); +#endif // C++ 11 + if (end_it != line.end()) { + cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; + cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; + } + // Get the line length (at least for the valid part) + int length = utf8::distance(line.begin(), end_it); + cout << "Length of line " << line_count << " is " << length << "\n"; + + // Convert it to utf-16 +#if __cplusplus >= 201103L // C++ 11 or later + u16string utf16line = utf8::utf8to16(line); +#else + vector<unsigned short> utf16line; + utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); +#endif // C++ 11 + // And back to utf-8; +#if __cplusplus >= 201103L // C++ 11 or later + string utf8line = utf8::utf16to8(utf16line); +#else + string utf8line; + utf8::utf16to8(utf16line.begin(),
utf16line.end(), back_inserter(utf8line)); +#endif // C++ 11 + // Confirm that the conversion went OK: + if (utf8line != string(line.begin(), end_it)) + cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; + + line_count++; + } + + return 0; +} +``` + +In the previous code sample, for each line we performed a detection of invalid UTF-8 sequences with `find_invalid`; the number of characters (more precisely - the number of Unicode code points, including the end of line and even BOM if there is one) in each line was determined with a use of `utf8::distance`; finally, we have converted each line to UTF-16 encoding with `utf8to16` and back to UTF-8 with `utf16to8`. + +Note a different pattern of usage for old compilers. For instance, this is how we convert +a UTF-8 encoded string to a UTF-16 encoded one with a pre-C++11 compiler: +```cpp + vector<unsigned short> utf16line; + utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); +``` + +With a more modern compiler, the same operation would look like: +```cpp + u16string utf16line = utf8::utf8to16(line); +``` +If the `__cplusplus` macro indicates C++ 11 or later, the library exposes an API that takes into +account C++ standard Unicode strings and move semantics. With an older compiler, it is still +possible to use the same functionality, just in a little less convenient way. + +In case you do not trust the `__cplusplus` macro or, for instance, do not want to include +the C++ 11 helper functions even with a modern compiler, define the `UTF_CPP_CPLUSPLUS` macro +before including `utf8.h` and assign it a value for the standard you want to use - the values are the same as for the `__cplusplus` macro. This can also be useful with compilers that are conservative in setting the `__cplusplus` macro even if they have good support for a recent standard edition - Microsoft's Visual C++ is one example.
+ +### Checking if a file contains valid UTF-8 text + +Here is a function that checks whether the content of a file is valid UTF-8 encoded text without reading the content into the memory: + +```cpp +bool valid_utf8_file(const char* file_name) +{ + ifstream ifs(file_name); + if (!ifs) + return false; // even better, throw here + + istreambuf_iterator<char> it(ifs.rdbuf()); + istreambuf_iterator<char> eos; + + return utf8::is_valid(it, eos); +} +``` + +Because the function `utf8::is_valid()` works with input iterators, we were able to pass an `istreambuf_iterator` to it and read the content of the file directly without loading it to the memory first. + +Note that other functions that take input iterator arguments can be used in a similar way. For instance, to read the content of a UTF-8 encoded text file and convert the text to UTF-16, just do something like: + +```cpp + utf8::utf8to16(it, eos, back_inserter(u16string)); +``` + +### Ensure that a string contains valid UTF-8 text + +If we have some text that "probably" contains UTF-8 encoded text and we want to replace any invalid UTF-8 sequence with a replacement character, something like the following function may be used: + +```cpp +void fix_utf8_string(std::string& str) +{ + std::string temp; + utf8::replace_invalid(str.begin(), str.end(), back_inserter(temp)); + str = temp; +} +``` + +The function will replace any invalid UTF-8 sequence with a Unicode replacement character. There is an overloaded function that enables the caller to supply their own replacement character. + + +## Points of interest + +#### Design goals and decisions + +The library was designed to be: + +1. Generic: for better or worse, there are many C++ string classes out there, and the library should work with as many of them as possible. +2. Portable: the library should be portable both across different platforms and compilers. The only non-portable code is a small section that declares unsigned integers of different sizes: three typedefs.
They can be changed by the users of the library if they don't match their platform. The default setting should work for Windows (both 32 and 64 bit), and most 32 bit and 64 bit Unix derivatives. Support for post C++03 language features is included for modern compilers at API level only, so the library should work even with pretty old compilers. +3. Lightweight: follow the "pay only for what you use" guideline. +4. Unintrusive: avoid forcing any particular design or even programming style on the user. This is a library, not a framework. + +#### Alternatives + +In case you want to look into other means of working with UTF-8 strings from C++, here is the list of solutions I am aware of: + +1. [ICU Library](http://icu.sourceforge.net/). It is very powerful, complete, feature-rich, mature, and widely used. Also big, intrusive, non-generic, and doesn't play well with the Standard Library. I definitely recommend looking at ICU even if you don't plan to use it. +2. C++11 language and library features. Still far from complete, and not easy to use. +3. [Glib::ustring](http://www.gtkmm.org/gtkmm2/docs/tutorial/html/ch03s04.html). A class specifically made to work with UTF-8 strings, and also feels like `std::string`. If you prefer to have yet another string class in your code, it may be worth a look. Be aware of the licensing issues, though. +4. Platform dependent solutions: Windows and POSIX have functions to convert strings from one encoding to another. That is only a subset of what my library offers, but if that is all you need it may be good enough. + + +## Reference + +### Functions From utf8 Namespace + +#### utf8::append + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. + +```cpp +void append(char32_t cp, std::string& s); +``` + +`cp`: a code point to append to the string. +`s`: a utf-8 encoded string to append the code point to.
+ +Example of use: + +```cpp +std::string u; +append(0x0448, u); +assert (u[0] == char(0xd1) && u[1] == char(0x88) && u.length() == 2); +``` + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + + +#### utf8::append + +Available in version 1.0 and later. + +Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. + +```cpp +template <typename octet_iterator> +octet_iterator append(uint32_t cp, octet_iterator result); +``` + +`octet_iterator`: an output iterator. +`cp`: a 32 bit integer representing a code point to append to the sequence. +`result`: an output iterator to the place in the sequence where to append the code point. +Return value: an iterator pointing to the place after the newly appended sequence. + +Example of use: + +```cpp +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = append(0x0448, u); +assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +``` + +Note that `append` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. To make things more interesting, `append` can add anywhere between 1 and 4 octets to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated. + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + +#### utf8::next + +Available in version 1.0 and later. + +Given the iterator to the beginning of the UTF-8 sequence, it returns the code point and moves the iterator to the next position. + +```cpp +template <typename octet_iterator> +uint32_t next(octet_iterator& it, octet_iterator end); +``` + +`octet_iterator`: an input iterator. +`it`: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point. 
+`end`: end of the UTF-8 sequence to be processed. If `it` gets equal to `end` during the extraction of a code point, an `utf8::not_enough_room` exception is thrown. +Return value: the 32 bit representation of the processed UTF-8 code point. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = next(w, twochars + 6); +assert (cp == 0x65e5); +assert (w == twochars + 3); +``` + +This function is typically used to iterate through a UTF-8 encoded string. + +In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. + +#### utf8::peek_next + +Available in version 2.1 and later. + +Given the iterator to the beginning of the UTF-8 sequence, it returns the code point for the following sequence without changing the value of the iterator. + +```cpp +template <typename octet_iterator> +uint32_t peek_next(octet_iterator it, octet_iterator end); +``` + + +`octet_iterator`: an input iterator. +`it`: an iterator pointing to the beginning of an UTF-8 encoded code point. +`end`: end of the UTF-8 sequence to be processed. If `it` gets equal to `end` during the extraction of a code point, an `utf8::not_enough_room` exception is thrown. +Return value: the 32 bit representation of the processed UTF-8 code point. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = peek_next(w, twochars + 6); +assert (cp == 0x65e5); +assert (w == twochars); +``` + +In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. + +#### utf8::prior + +Available in version 1.02 and later. + +Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point. + +```cpp +template <typename octet_iterator> +uint32_t prior(octet_iterator& it, octet_iterator start); +``` + +`octet_iterator`: a bidirectional iterator. 
+`it`: a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point. +`start`: an iterator to the beginning of the sequence where the search for the beginning of a code point is performed. It is a safety measure to prevent passing the beginning of the string in the search for a UTF-8 lead octet. + Return value: the 32 bit representation of the previous code point. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +unsigned char* w = twochars + 3; +int cp = prior (w, twochars); +assert (cp == 0x65e5); +assert (w == twochars); +``` + +This function has two purposes: one is to iterate backwards through a UTF-8 encoded string. Note that it is usually a better idea to iterate forward instead, since `utf8::next` is faster. The second purpose is to find a beginning of a UTF-8 sequence if we have a random position within a string. Note that in that case `utf8::prior` may not detect an invalid UTF-8 sequence in some scenarios: for instance if there are superfluous trail octets, it will just skip them. + +`it` will typically point to the beginning of a code point, and `start` will point to the beginning of the string to ensure we don't go backwards too far. `it` is decreased until it points to a lead UTF-8 octet, and then the UTF-8 sequence beginning with that octet is decoded to a 32 bit representation and returned. + +In case `start` is reached before a UTF-8 lead octet is hit, or if an invalid UTF-8 sequence is started by the lead octet, an `invalid_utf8` exception is thrown. + +In case `start` equals `it`, a `not_enough_room` exception is thrown. + +#### utf8::advance +Available in version 1.0 and later. + +Advances an iterator by the specified number of code points within an UTF-8 sequence. 
+ +```cpp +template <typename octet_iterator, typename distance_type> +void advance (octet_iterator& it, distance_type n, octet_iterator end); +``` + +`octet_iterator`: an input iterator. +`distance_type`: an integral type convertible to `octet_iterator`'s difference type. +`it`: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the nth following code point. +`n`: number of code points `it` should be advanced. A negative value means decrement. +`end`: limit of the UTF-8 sequence to be processed. If `n` is positive and `it` gets equal to `end` during the extraction of a code point, an `utf8::not_enough_room` exception is thrown. If `n` is negative and `it` reaches `end` while `it` points to a trail byte of a UTF-8 sequence, a `utf8::invalid_code_point` exception is thrown. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +unsigned char* w = twochars; +advance (w, 2, twochars + 6); +assert (w == twochars + 5); +advance (w, -2, twochars); +assert (w == twochars); +``` + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + +#### utf8::distance + +Available in version 1.0 and later. + +Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them. + +```cpp +template <typename octet_iterator> +typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last); +``` + +`octet_iterator`: an input iterator. +`first`: an iterator to a beginning of a UTF-8 encoded code point. +`last`: an iterator to a "post-end" of the last UTF-8 encoded code point in the sequence we are trying to determine the length. It can be the beginning of a new code point, or not. + Return value: the distance between the iterators, in code points. 
+ +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +size_t dist = utf8::distance(twochars, twochars + 5); +assert (dist == 2); +``` + +This function is used to find the length (in code points) of a UTF-8 encoded string. The reason it is called _distance_, rather than, say, _length_ is mainly because developers are used to _length_ being an O(1) function. Computing the length of an UTF-8 string is a linear operation, and it looked better to model it after `std::distance` algorithm. + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `last` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. + +#### utf8::utf16to8 + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Converts a UTF-16 encoded string to UTF-8. + +```cpp +std::string utf16to8(const std::u16string& s); +``` + +`s`: a UTF-16 encoded string. +Return value: A UTF-8 encoded string. + +Example of use: + +```cpp + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string u = utf16to8(utf16string); + assert (u.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + + +#### utf8::utf16to8 + +Available in version 1.0 and later. + +Converts a UTF-16 encoded string to UTF-8. + +```cpp +template <typename u16bit_iterator, typename octet_iterator> +octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); +``` + +`u16bit_iterator`: an input iterator. +`octet_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-16 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-16 encoded string to convert. +`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-8 string. 
+ +Example of use: + +```cpp +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; +utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); +assert (utf8result.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + +#### utf8::utf8to16 + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Converts an UTF-8 encoded string to UTF-16. + +```cpp +std::u16string utf8to16(const std::string& s); +``` + +`s`: an UTF-8 encoded string to convert. +Return value: A UTF-16 encoded string + +Example of use: + +```cpp +string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +u16string utf16result = utf8to16(utf8_with_surrogates); +assert (utf16result.length() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + +#### utf8::utf8to16 + +Available in version 1.0 and later. + +Converts an UTF-8 encoded string to UTF-16 + +```cpp +template <typename u16bit_iterator, typename octet_iterator> +u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); +``` + +`octet_iterator`: an input iterator. +`u16bit_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. <br /> `end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. +`result`: an output iterator to the place in the UTF-16 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-16 string. 
+ +Example of use: + +```cpp +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; +utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); +assert (utf16result.size() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 seqence, a `utf8::not_enough_room` exception is thrown. + +#### utf8::utf32to8 + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::string utf32to8(const std::u32string& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +string utf8result = utf32to8(utf32string); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + +#### utf8::utf32to8 + +Available in version 1.0 and later. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +template <typename octet_iterator, typename u32bit_iterator> +octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); +``` + +`octet_iterator`: an output iterator. +`u32bit_iterator`: an input iterator. +`start`: an iterator pointing to the beginning of the UTF-32 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-32 encoded string to convert. +`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-8 string. 
+ +Example of use: + +```cpp +int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; +vector<unsigned char> utf8result; +utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + +#### utf8::utf8to32 + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +std::u32string utf8to32(const std::string& s); +``` + +`s`: a UTF-8 encoded string. +Return value: a UTF-32 encoded string. + +Example of use: + +```cpp +const char* twochars = "\xe6\x97\xa5\xd1\x88"; +u32string utf32result = utf8to32(twochars); +assert (utf32result.size() == 2); +``` + +In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. + + +#### utf8::utf8to32 + +Available in version 1.0 and later. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +template <typename octet_iterator, typename u32bit_iterator> +u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); +``` + +`octet_iterator`: an input iterator. +`u32bit_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. +`result`: an output iterator to the place in the UTF-32 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-32 string. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; +utf8to32(twochars, twochars + 5, back_inserter(utf32result)); +assert (utf32result.size() == 2); +``` + +In case of an invalid UTF-8 seqence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 seqence, a `utf8::not_enough_room` exception is thrown. 
+ +#### utf8::find_invalid + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Detects an invalid sequence within a UTF-8 string. + +```cpp +std::size_t find_invalid(const std::string& s); +``` + +`s`: a UTF-8 encoded string. +Return value: the index of the first invalid octet in the UTF-8 string. In case none were found, equals `std::string::npos`. + +Example of use: + +```cpp +string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; +auto invalid = find_invalid(utf_invalid); +assert (invalid == 5); +``` + +This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it before doing any of the _unchecked_ operations on it. + +#### utf8::find_invalid + +Available in version 1.0 and later. + +Detects an invalid sequence within a UTF-8 string. + +```cpp +template <typename octet_iterator> +octet_iterator find_invalid(octet_iterator start, octet_iterator end); +``` + +`octet_iterator`: an input iterator. +`start`: an iterator pointing to the beginning of the UTF-8 string to test for validity. +`end`: an iterator pointing to pass-the-end of the UTF-8 string to test for validity. +Return value: an iterator pointing to the first invalid octet in the UTF-8 string. In case none were found, equals `end`. + +Example of use: + +```cpp +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +char* invalid = find_invalid(utf_invalid, utf_invalid + 6); +assert (invalid == utf_invalid + 5); +``` + +This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it before doing any of the _unchecked_ operations on it. + +#### utf8::is_valid + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Checks whether a string object contains valid UTF-8 encoded text. + +```cpp +bool is_valid(const std::string& s); +``` + +`s`: a UTF-8 encoded string. 
+Return value: `true` if the string contains valid UTF-8 encoded text; `false` if not. + +Example of use: + +```cpp +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +bool bvalid = is_valid(utf_invalid); +assert (bvalid == false); +``` + +You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. + +#### utf8::is_valid + +Available in version 1.0 and later. + +Checks whether a sequence of octets is a valid UTF-8 string. + +```cpp +template <typename octet_iterator> +bool is_valid(octet_iterator start, octet_iterator end); +``` + +`octet_iterator`: an input iterator. +`start`: an iterator pointing to the beginning of the UTF-8 string to test for validity. +`end`: an iterator pointing to pass-the-end of the UTF-8 string to test for validity. +Return value: `true` if the sequence is a valid UTF-8 string; `false` if not. + +Example of use: + +```cpp +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +bool bvalid = is_valid(utf_invalid, utf_invalid + 6); +assert (bvalid == false); +``` + +`is_valid` is a shorthand for `find_invalid(start, end) == end;`. You may want to use it to make sure that a byte seqence is a valid UTF-8 string without the need to know where it fails if it is not valid. + +#### utf8::replace_invalid + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Replaces all invalid UTF-8 sequences within a string with a replacement marker. + +```cpp +std::string replace_invalid(const std::string& s, char32_t replacement); +std::string replace_invalid(const std::string& s); +``` + +`s`: a UTF-8 encoded string. +`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd` +Return value: A UTF-8 encoded string with replaced invalid sequences. 
+ +Example of use: + +```cpp +string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +string replace_invalid_result = replace_invalid(invalid_sequence, '?'); +bvalid = is_valid(replace_invalid_result); +assert (bvalid); +const string fixed_invalid_sequence = "a????z"; +assert (fixed_invalid_sequence == replace_invalid_result); +``` + +#### utf8::replace_invalid + +Available in version 2.0 and later. + +Replaces all invalid UTF-8 sequences within a string with a replacement marker. + +```cpp +template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); +template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); +``` + +`octet_iterator`: an input iterator. +`output_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences. +`end`: an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences. +`out`: An output iterator to the range where the result of replacement is stored. +`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd` +Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences. 
+ +Example of use: + +```cpp +char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +vector<char> replace_invalid_result; +replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); +bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); +assert (bvalid); +char* fixed_invalid_sequence = "a????z"; +assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); +``` + +`replace_invalid` does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, `out` must not be in the `[start, end]` range. + +#### utf8::starts_with_bom + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Checks whether a string starts with a UTF-8 byte order mark (BOM) + +```cpp +bool starts_with_bom(const std::string& s); +``` + +`s`: a UTF-8 encoded string. +Return value: `true` if the string starts with a UTF-8 byte order mark; `false` if not. + +Example of use: + +```cpp +string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; +bool bbom = starts_with_bom(byte_order_mark); +assert (bbom == true); +string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; +bool no_bbom = starts_with_bom(threechars); +assert (no_bbom == false); + ``` + +The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. + + +#### utf8::starts_with_bom + +Available in version 2.3 and later. + +Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM) + +```cpp +template <typename octet_iterator> +bool starts_with_bom (octet_iterator it, octet_iterator end); +``` + +`octet_iterator`: an input iterator. 
+`it`: beginning of the octet sequence to check +`end`: pass-end of the sequence to check +Return value: `true` if the sequence starts with a UTF-8 byte order mark; `false` if not. + +Example of use: + +```cpp +unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; +bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); +assert (bbom == true); +``` + +The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. + +### Types From utf8 Namespace + +#### utf8::exception + +Available in version 2.3 and later. + +Base class for the exceptions thrown by UTF CPP library functions. + +```cpp +class exception : public std::exception {}; +``` + +Example of use: + +```cpp +try { + code_that_uses_utf_cpp_library(); +} +catch(const utf8::exception& utfcpp_ex) { + cerr << utfcpp_ex.what(); +} +``` + +#### utf8::invalid_code_point + +Available in version 1.0 and later. + +Thrown by UTF8 CPP functions such as `advance` and `next` if an UTF-8 sequence represents an invalid code point. + +```cpp +class invalid_code_point : public exception { +public: + uint32_t code_point() const; +}; +``` + +Member function `code_point()` can be used to determine the invalid code point that caused the exception to be thrown. + +#### utf8::invalid_utf8 + +Available in version 1.0 and later. + +Thrown by UTF8 CPP functions such as `next` and `prior` if an invalid UTF-8 sequence is detected during decoding. + +```cpp +class invalid_utf8 : public exception { +public: + uint8_t utf8_octet() const; +}; +``` + +Member function `utf8_octet()` can be used to determine the beginning of the byte sequence that caused the exception to be thrown. + +#### utf8::invalid_utf16 + +Available in version 1.0 and later. + +Thrown by UTF8 CPP function `utf16to8` if an invalid UTF-16 sequence is detected during decoding. 
+ +```cpp +class invalid_utf16 : public exception { +public: + uint16_t utf16_word() const; +}; +``` + +Member function `utf16_word()` can be used to determine the UTF-16 code unit that caused the exception to be thrown. + +#### utf8::not_enough_room + +Available in version 1.0 and later. + +Thrown by UTF8 CPP functions such as `next` if the end of the decoded UTF-8 sequence was reached before the code point was decoded. + +```cpp +class not_enough_room : public exception {}; +``` + +#### utf8::iterator + +Available in version 2.0 and later. + +Adapts the underlying octet iterator to iterate over the sequence of code points, rather than raw octets. + +```cpp +template <typename octet_iterator> +class iterator; +``` + +##### Member functions + +`iterator();` the default constructor; the underlying octet_iterator is constructed with its default constructor. + +`explicit iterator (const octet_iterator& octet_it, const octet_iterator& range_start, const octet_iterator& range_end);` a constructor that initializes the underlying octet_iterator with octet_it and sets the range in which the iterator is considered valid. + +`octet_iterator base () const;` returns the underlying octet_iterator. + +`uint32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. + +`bool operator == (const iterator& rhs) const;` returns `true` if the two underlying iterators are equal. + +`bool operator != (const iterator& rhs) const;` returns `true` if the two underlying iterators are not equal. + +`iterator& operator ++ ();` the prefix increment - moves the iterator to the next UTF-8 encoded code point. + +`iterator operator ++ (int);` the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one. + +`iterator& operator -- ();` the prefix decrement - moves the iterator to the previous UTF-8 encoded code point. 
+ +`iterator operator -- (int);` the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one. + +Example of use: + +```cpp +char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; +utf8::iterator<char*> it(threechars, threechars, threechars + 9); +utf8::iterator<char*> it2 = it; +assert (it2 == it); +assert (*it == 0x10346); +assert (*(++it) == 0x65e5); +assert ((*it++) == 0x65e5); +assert (*it == 0x0448); +assert (it != it2); +utf8::iterator<char*> endit (threechars + 9, threechars, threechars + 9); +assert (++it == endit); +assert (*(--it) == 0x0448); +assert ((*it--) == 0x0448); +assert (*it == 0x65e5); +assert (--it == utf8::iterator<char*>(threechars, threechars, threechars + 9)); +assert (*it == 0x10346); +``` + +The purpose of `utf8::iterator` adapter is to enable easy iteration as well as the use of STL algorithms with UTF-8 encoded strings. Increment and decrement operators are implemented in terms of `utf8::next()` and `utf8::prior()` functions. + +Note that `utf8::iterator` adapter is a checked iterator. It operates on the range specified in the constructor; any attempt to go out of that range will result in an exception. Even the comparison operators require both iterator objects to be constructed against the same range - otherwise an exception is thrown. Typically, the range will be determined by sequence container functions `begin` and `end`, i.e.: + +```cpp +std::string s = "example"; +utf8::iterator i (s.begin(), s.begin(), s.end()); +``` + +### Functions From utf8::unchecked Namespace + +#### utf8::unchecked::append + +Available in version 1.0 and later. + +Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. + +```cpp +template <typename octet_iterator> +octet_iterator append(uint32_t cp, octet_iterator result); +``` + +`cp`: A 32 bit integer representing a code point to append to the sequence. 
+`result`: An output iterator to the place in the sequence where to append the code point. +Return value: An iterator pointing to the place after the newly appended sequence. + +Example of use: + +```cpp +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = unchecked::append(0x0448, u); +assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +``` + +This is a faster but less safe version of `utf8::append`. It does not check for validity of the supplied code point, and may produce an invalid UTF-8 sequence. + +#### utf8::unchecked::next + +Available in version 1.0 and later. + +Given the iterator to the beginning of a UTF-8 sequence, it returns the code point and moves the iterator to the next position. + +```cpp +template <typename octet_iterator> +uint32_t next(octet_iterator& it); +``` + +`it`: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point. + Return value: the 32 bit representation of the processed UTF-8 code point. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = unchecked::next(w); +assert (cp == 0x65e5); +assert (w == twochars + 3); +``` + +This is a faster but less safe version of `utf8::next`. It does not check for validity of the supplied UTF-8 sequence. + +#### utf8::unchecked::peek_next + +Available in version 2.1 and later. + +Given the iterator to the beginning of a UTF-8 sequence, it returns the code point. + +```cpp +template <typename octet_iterator> +uint32_t peek_next(octet_iterator it); +``` + +`it`: an iterator pointing to the beginning of an UTF-8 encoded code point. +Return value: the 32 bit representation of the processed UTF-8 code point. 
+ +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +int cp = unchecked::peek_next(w); +assert (cp == 0x65e5); +assert (w == twochars); +``` + +This is a faster but less safe version of `utf8::peek_next`. It does not check for validity of the supplied UTF-8 sequence. + +#### utf8::unchecked::prior + +Available in version 1.02 and later. + +Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point. + +```cpp +template <typename octet_iterator> +uint32_t prior(octet_iterator& it); +``` + +`it`: a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point. + Return value: the 32 bit representation of the previous code point. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars + 3; +int cp = unchecked::prior (w); +assert (cp == 0x65e5); +assert (w == twochars); +``` + +This is a faster but less safe version of `utf8::prior`. It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking. + +#### utf8::unchecked::advance + +Available in version 1.0 and later. + +Advances an iterator by the specified number of code points within an UTF-8 sequence. + +```cpp +template <typename octet_iterator, typename distance_type> +void advance (octet_iterator& it, distance_type n); +``` + +`it`: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the nth following code point. +`n`: number of code points `it` should be advanced. A negative value means decrement. 
+ +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +char* w = twochars; +unchecked::advance (w, 2); +assert (w == twochars + 5); +``` + +This is a faster but less safe version of `utf8::advance`. It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking. + +#### utf8::unchecked::distance + +Available in version 1.0 and later. + +Given the iterators to two UTF-8 encoded code points in a seqence, returns the number of code points between them. + +```cpp +template <typename octet_iterator> +typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last); +``` + +`first`: an iterator to a beginning of a UTF-8 encoded code point. +`last`: an iterator to a "post-end" of the last UTF-8 encoded code point in the sequence we are trying to determine the length. It can be the beginning of a new code point, or not. +Return value: the distance between the iterators, in code points. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +size_t dist = utf8::unchecked::distance(twochars, twochars + 5); +assert (dist == 2); +``` + +This is a faster but less safe version of `utf8::distance`. It does not check for validity of the supplied UTF-8 sequence. + +#### utf8::unchecked::utf16to8 + +Available in version 1.0 and later. + +Converts a UTF-16 encoded string to UTF-8. + +```cpp +template <typename u16bit_iterator, typename octet_iterator> +octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); +``` + +`start`: an iterator pointing to the beginning of the UTF-16 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-16 encoded string to convert. +`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-8 string. 
+ +Example of use: + +```cpp +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector<unsigned char> utf8result; +unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); +assert (utf8result.size() == 10); +``` + +This is a faster but less safe version of `utf8::utf16to8`. It does not check for validity of the supplied UTF-16 sequence. + +#### utf8::unchecked::utf8to16 + +Available in version 1.0 and later. + +Converts a UTF-8 encoded string to UTF-16. + +```cpp +template <typename u16bit_iterator, typename octet_iterator> +u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); +``` + +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. +`end`: an iterator pointing to past-the-end of the UTF-8 encoded string to convert. +`result`: an output iterator to the place in the UTF-16 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-16 string. + +Example of use: + +```cpp +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector <unsigned short> utf16result; +unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); +assert (utf16result.size() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +This is a faster but less safe version of `utf8::utf8to16`. It does not check for validity of the supplied UTF-8 sequence. + +#### utf8::unchecked::utf32to8 + +Available in version 1.0 and later. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +template <typename octet_iterator, typename u32bit_iterator> +octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); +``` + +`start`: an iterator pointing to the beginning of the UTF-32 encoded string to convert. +`end`: an iterator pointing to past-the-end of the UTF-32 encoded string to convert. 
+`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-8 string. + +Example of use: + +```cpp +int utf32string[] = {0x448, 0x65e5, 0x10346, 0}; +vector<unsigned char> utf8result; +utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); +assert (utf8result.size() == 9); +``` + +This is a faster but less safe version of `utf8::utf32to8`. It does not check for validity of the supplied UTF-32 sequence. + +#### utf8::unchecked::utf8to32 + +Available in version 1.0 and later. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +template <typename octet_iterator, typename u32bit_iterator> +u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); +``` + +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. +`end`: an iterator pointing to past-the-end of the UTF-8 encoded string to convert. +`result`: an output iterator to the place in the UTF-32 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-32 string. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector<int> utf32result; +unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result)); +assert (utf32result.size() == 2); +``` + +This is a faster but less safe version of `utf8::utf8to32`. It does not check for validity of the supplied UTF-8 sequence. + +#### utf8::unchecked::replace_invalid + +Available in version 3.1 and later. + +Replaces all invalid UTF-8 sequences within a string with a replacement marker. 
+ +```cpp +template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); +template <typename octet_iterator, typename output_iterator> +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); +``` + +`octet_iterator`: an input iterator. +`output_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences. +`end`: an iterator pointing to past-the-end of the UTF-8 string to look for invalid UTF-8 sequences. +`out`: An output iterator to the range where the result of replacement is stored. +`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd`. +Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences. + +Example of use: + +```cpp +char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +vector<char> replace_invalid_result; +unchecked::replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); +bool bvalid = utf8::is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); +assert (bvalid); +char* fixed_invalid_sequence = "a????z"; +assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); +``` + +`replace_invalid` does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, `out` must not be in the `[start, end]` range. + +Unlike `utf8::replace_invalid`, this function does not verify validity of the replacement marker. + +### Types From utf8::unchecked Namespace + +#### utf8::unchecked::iterator + +Available in version 2.0 and later. 
+ +Adapts the underlying octet iterator to iterate over the sequence of code points, rather than raw octets. + +```cpp +template <typename octet_iterator> +class iterator; +``` + +##### Member functions + +`iterator();` the default constructor; the underlying octet_iterator is constructed with its default constructor. + +`explicit iterator (const octet_iterator& octet_it);` a constructor that initializes the underlying octet_iterator with `octet_it`. + +`octet_iterator base () const;` returns the underlying octet_iterator. + +`uint32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. + +`bool operator == (const iterator& rhs) const;` returns `true` if the two underlying iterators are equal. + +`bool operator != (const iterator& rhs) const;` returns `true` if the two underlying iterators are not equal. + +`iterator& operator ++ ();` the prefix increment - moves the iterator to the next UTF-8 encoded code point. + +`iterator operator ++ (int);` the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one. + +`iterator& operator -- ();` the prefix decrement - moves the iterator to the previous UTF-8 encoded code point. + +`iterator operator -- (int);` the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one. 
+ +Example of use: + +```cpp +char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; +utf8::unchecked::iterator<char*> un_it(threechars); +utf8::unchecked::iterator<char*> un_it2 = un_it; +assert (un_it2 == un_it); +assert (*un_it == 0x10346); +assert (*(++un_it) == 0x65e5); +assert ((*un_it++) == 0x65e5); +assert (*un_it == 0x0448); +assert (un_it != un_it2); +utf8::unchecked::iterator<char*> un_endit (threechars + 9); +assert (++un_it == un_endit); +assert (*(--un_it) == 0x0448); +assert ((*un_it--) == 0x0448); +assert (*un_it == 0x65e5); +assert (--un_it == utf8::unchecked::iterator<char*>(threechars)); +assert (*un_it == 0x10346); +``` + +This is an unchecked version of `utf8::iterator`. It is faster in many cases, but offers no validity or range checks. + +## Links + +1. [The Unicode Consortium](http://www.unicode.org/). +2. [ICU Library](http://icu.sourceforge.net/). +3. [UTF-8 at Wikipedia](http://en.wikipedia.org/wiki/UTF-8) +4. [UTF-8 and Unicode FAQ for Unix/Linux](http://www.cl.cam.ac.uk/~mgk25/unicode.html) diff --git a/3rdparty/utfcpp/samples/docsample.cpp b/3rdparty/utfcpp/samples/docsample.cpp new file mode 100644 index 000000000..653388725 --- /dev/null +++ b/3rdparty/utfcpp/samples/docsample.cpp @@ -0,0 +1,64 @@ +#include "../source/utf8.h" +#include <iostream> +#include <fstream> +#include <string> +#include <vector> + + +using namespace std; + +int main(int argc, char** argv) +{ + if (argc != 2) { + cout << "\nUsage: docsample filename\n"; + return 0; + } + const char* test_file_path = argv[1]; + // Open the test file (must be UTF-8 encoded) + ifstream fs8(test_file_path); + if (!fs8.is_open()) { + cout << "Could not open " << test_file_path << endl; + return 0; + } + + unsigned line_count = 1; + string line; + // Play with all the lines in the file + while (getline(fs8, line)) { + // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) +#if __cplusplus >= 201103L // C++ 11 or later + auto end_it = 
utf8::find_invalid(line.begin(), line.end()); +#else + string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); +#endif // C++ 11 + if (end_it != line.end()) { + cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; + cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; + } + // Get the line length (at least for the valid part) + ptrdiff_t length = utf8::distance(line.begin(), end_it); + cout << "Length of line " << line_count << " is " << length << "\n"; + + // Convert it to utf-16 +#if __cplusplus >= 201103L // C++ 11 or later + u16string utf16line = utf8::utf8to16(line); +#else + vector<unsigned short> utf16line; + utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); +#endif // C++ 11 + // And back to utf-8; +#if __cplusplus >= 201103L // C++ 11 or later + string utf8line = utf8::utf16to8(utf16line); +#else + string utf8line; + utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); +#endif // C++ 11 + // Confirm that the conversion went OK: + if (utf8line != string(line.begin(), end_it)) + cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; + + line_count++; + } + + return 0; +} diff --git a/3rdparty/utfcpp/source/utf8.h b/3rdparty/utfcpp/source/utf8.h new file mode 100644 index 000000000..82b13f59f --- /dev/null +++ b/3rdparty/utfcpp/source/utf8.h @@ -0,0 +1,34 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following 
disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/3rdparty/utfcpp/source/utf8/checked.h b/3rdparty/utfcpp/source/utf8/checked.h new file mode 100644 index 000000000..71b9076f6 --- /dev/null +++ b/3rdparty/utfcpp/source/utf8/checked.h @@ -0,0 +1,333 @@ +// Copyright 2006-2016 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code 
generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include <stdexcept> + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + uint32_t cp; + public: + invalid_code_point(uint32_t codepoint) : cp(codepoint) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } + uint32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + uint8_t u8; + public: + invalid_utf8 (uint8_t u) : u8(u) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } + uint8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + uint16_t u16; + public: + invalid_utf16 (uint16_t u) : u16(u) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } + uint16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template <typename octet_iterator> + octet_iterator 
append(uint32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + if (cp < 0x80) // one octet + *(result++) = static_cast<uint8_t>(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); + *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + return result; + } + + template <typename octet_iterator, typename output_iterator> + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template <typename octet_iterator, typename output_iterator> + inline output_iterator replace_invalid(octet_iterator start, octet_iterator 
end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + template <typename octet_iterator> + uint32_t next(octet_iterator& it, octet_iterator end) + { + uint32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(*it); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template <typename octet_iterator> + uint32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template <typename octet_iterator> + uint32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + template <typename octet_iterator, typename distance_type> + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::prior(it, end); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::next(it, end); + } + } + + template <typename octet_iterator> + typename std::iterator_traits<octet_iterator>::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits<octet_iterator>::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template 
<typename u16bit_iterator, typename octet_iterator> + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); + } + else + throw invalid_utf16(static_cast<uint16_t>(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast<uint16_t>(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template <typename u16bit_iterator, typename octet_iterator> + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast<uint16_t>(cp); + } + return result; + } + + template <typename octet_iterator, typename u32bit_iterator> + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template <typename octet_iterator, typename u32bit_iterator> + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template <typename octet_iterator> + class iterator { + octet_iterator it; + octet_iterator range_start; + 
octet_iterator range_end; + public: + typedef uint32_t value_type; + typedef uint32_t* pointer; + typedef uint32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& rangestart, + const octet_iterator& rangeend) : + it(octet_it), range_start(rangestart), range_end(rangeend) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later +#include "cpp11.h" +#endif // C++ 11 or later + +#endif //header guard + diff --git a/3rdparty/utfcpp/source/utf8/core.h b/3rdparty/utfcpp/source/utf8/core.h new file mode 100644 index 000000000..de6199f2a --- /dev/null +++ b/3rdparty/utfcpp/source/utf8/core.h @@ -0,0 +1,338 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation 
covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include <iterator> + +// Determine the C++ standard version. +// If the user defines UTF_CPP_CPLUSPLUS, use that. +// Otherwise, trust the unreliable predefined macro __cplusplus + +#if !defined UTF_CPP_CPLUSPLUS + #define UTF_CPP_CPLUSPLUS __cplusplus +#endif + +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #define UTF_CPP_OVERRIDE override + #define UTF_CPP_NOEXCEPT noexcept +#else // C++ 98/03 + #define UTF_CPP_OVERRIDE + #define UTF_CPP_NOEXCEPT throw() +#endif // C++ 11 or later + + +namespace utf8 +{ + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers + // You may need to change them to match your system. 
+ // These typedefs have the same names as ones from cstdint, or boost/cstdint + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + +// Helper code - not intended to be directly called by the library users. May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + + // Maximum valid value for a Unicode code point + const uint32_t CODE_POINT_MAX = 0x0010ffffu; + + template<typename octet_type> + inline uint8_t mask8(octet_type oc) + { + return static_cast<uint8_t>(0xff & oc); + } + template<typename u16_type> + inline uint16_t mask16(u16_type oc) + { + return static_cast<uint16_t>(0xffff & oc); + } + template<typename octet_type> + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + template <typename u16> + inline bool is_lead_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } + + template <typename u16> + inline bool is_trail_surrogate(u16 cp) + { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template <typename u16> + inline bool is_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template <typename u32> + inline bool is_code_point_valid(u32 cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + template <typename octet_iterator> + inline typename std::iterator_traits<octet_iterator>::difference_type + sequence_length(octet_iterator 
lead_it) + { + uint8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + template <typename octet_difference_type> + inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template <typename octet_iterator> + utf_error increase_safely(octet_iterator& it, octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template <typename octet_iterator> + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + return UTF8_OK; + } + + template <typename octet_iterator> + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template <typename octet_iterator> + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = 
utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + template <typename octet_iterator> + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template <typename octet_iterator> + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + uint32_t cp = 0; + // Determine the sequence length based on the lead octet + typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type; + const octet_difference_type length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. 
Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. + code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template <typename octet_iterator> + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + uint32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + + template <typename octet_iterator> + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + template <typename octet_iterator> + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + template <typename octet_iterator> + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } +} // namespace utf8 + +#endif // header guard + + diff --git a/3rdparty/utfcpp/source/utf8/cpp11.h b/3rdparty/utfcpp/source/utf8/cpp11.h new file mode 100644 index 000000000..d93961b04 --- /dev/null +++ b/3rdparty/utfcpp/source/utf8/cpp11.h @@ -0,0 +1,103 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying 
documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 +#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 + +#include "checked.h" +#include <string> + +namespace utf8 +{ + + inline void append(char32_t cp, std::string& s) + { + append(uint32_t(cp), std::back_inserter(s)); + } + + inline std::string utf16to8(const std::u16string& s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(const std::u32string& s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::string& s) + { + std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string::npos : (invalid - s.begin()); + } + + inline bool is_valid(const std::string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::string replace_invalid(const std::string& s, char32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/3rdparty/utfcpp/source/utf8/unchecked.h b/3rdparty/utfcpp/source/utf8/unchecked.h new file mode 100644 index 000000000..0e1b51cc7 --- /dev/null +++ b/3rdparty/utfcpp/source/utf8/unchecked.h @@ -0,0 +1,274 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template <typename octet_iterator> + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (cp < 0x80) // one octet + *(result++) = static_cast<uint8_t>(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); + *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + return result; + } + + template <typename octet_iterator, typename output_iterator> + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::unchecked::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::unchecked::append (replacement, out); + 
++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::unchecked::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template <typename octet_iterator, typename output_iterator> + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); + } + + template <typename octet_iterator> + uint32_t next(octet_iterator& it) + { + uint32_t cp = utf8::internal::mask8(*it); + typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it); + switch (length) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + + template <typename octet_iterator> + uint32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template <typename octet_iterator> + uint32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + template <typename octet_iterator, typename distance_type> + void advance (octet_iterator& it, distance_type n) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::unchecked::prior(it); + } else { + // forward + for 
(distance_type i = zero; i < n; ++i) + utf8::unchecked::next(it); + } + } + + template <typename octet_iterator> + typename std::iterator_traits<octet_iterator>::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits<octet_iterator>::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template <typename u16bit_iterator, typename octet_iterator> + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template <typename u16bit_iterator, typename octet_iterator> + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast<uint16_t>(cp); + } + return result; + } + + template <typename octet_iterator, typename u32bit_iterator> + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template <typename octet_iterator, typename u32bit_iterator> + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template 
<typename octet_iterator> + class iterator { + octet_iterator it; + public: + typedef uint32_t value_type; + typedef uint32_t* pointer; + typedef uint32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + iterator () {} + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + + +#endif // header guard + diff --git a/3rdparty/utfcpp/tests/CMakeLists.txt b/3rdparty/utfcpp/tests/CMakeLists.txt new file mode 100644 index 000000000..06e0d7e9c --- /dev/null +++ b/3rdparty/utfcpp/tests/CMakeLists.txt @@ -0,0 +1,41 @@ +add_executable(negative ${PROJECT_SOURCE_DIR}/tests/negative.cpp) +add_executable(cpp11 ${PROJECT_SOURCE_DIR}/tests/test_cpp11.cpp) +add_executable(apitests + ${PROJECT_SOURCE_DIR}/tests/test_checked_api.cpp + ${PROJECT_SOURCE_DIR}/tests/test_unchecked_api.cpp + ${PROJECT_SOURCE_DIR}/tests/test_checked_iterator.cpp + ${PROJECT_SOURCE_DIR}/tests/test_unchecked_iterator.cpp +) + +add_executable(noexceptionstests + ${PROJECT_SOURCE_DIR}/tests/test_unchecked_api.cpp + ${PROJECT_SOURCE_DIR}/tests/test_unchecked_iterator.cpp +) + 
+target_link_libraries(negative PRIVATE utf8::cpp) +target_link_libraries(cpp11 PRIVATE + utf8::cpp + gtest_main + ) +target_link_libraries(apitests PRIVATE + utf8::cpp + gtest_main +) + +target_link_libraries(noexceptionstests PRIVATE + utf8::cpp + gtest_main +) +target_compile_options(noexceptionstests PUBLIC -fno-exceptions) + +set_target_properties(negative + PROPERTIES + CXX_STANDARD 98 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO) + +add_test(negative_test negative ${PROJECT_SOURCE_DIR}/tests/test_data/utf8_invalid.txt) +add_test(cpp11_test cpp11) +add_test(api_test apitests) +add_test(noexceptions_test noexceptionstests) + diff --git a/3rdparty/utfcpp/tests/docker/Dockerfile b/3rdparty/utfcpp/tests/docker/Dockerfile new file mode 100644 index 000000000..125a26936 --- /dev/null +++ b/3rdparty/utfcpp/tests/docker/Dockerfile @@ -0,0 +1,5 @@ +FROM debian:stretch-slim + +RUN apt-get update \ + && apt-get install -y make g++ cmake git \ + && rm -rf /var/lib/apt/lists/* diff --git a/3rdparty/utfcpp/tests/negative.cpp b/3rdparty/utfcpp/tests/negative.cpp new file mode 100644 index 000000000..f1bcc993e --- /dev/null +++ b/3rdparty/utfcpp/tests/negative.cpp @@ -0,0 +1,59 @@ +#include "utf8.h" +using namespace utf8; + +#include <string> +#include <iostream> +#include <fstream> +#include <algorithm> +using namespace std; + +const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264}; +const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned); + +int main(int argc, char** argv) +{ + string test_file_path; + if (argc == 2) + test_file_path = argv[1]; + else { + cout << "Wrong number of arguments" << endl; + return 1; + } + // Open 
the test file + ifstream fs8(test_file_path.c_str()); + if (!fs8.is_open()) { + cout << "Could not open " << test_file_path << endl; + return 1; + } + + // Read it line by line + unsigned int line_count = 0; + char byte; + while (!fs8.eof()) { + string line; + while ((byte = static_cast<char>(fs8.get())) != '\n' && !fs8.eof()) + line.push_back(byte); + + line_count++; + bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END); + // Print out lines that contain unexpected invalid UTF-8 + if (!is_valid(line.begin(), line.end())) { + if (expected_valid) { + cout << "Unexpected invalid utf-8 at line " << line_count << '\n'; + return 1; + } + + // try fixing it: + string fixed_line; + replace_invalid(line.begin(), line.end(), back_inserter(fixed_line)); + if (!is_valid(fixed_line.begin(), fixed_line.end())) { + cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n'; + return 1; + } + } + else if (!expected_valid) { + cout << "Invalid utf-8 NOT detected at line " << line_count << '\n'; + return 1; + } + } +} diff --git a/3rdparty/utfcpp/tests/test_checked_api.cpp b/3rdparty/utfcpp/tests/test_checked_api.cpp new file mode 100644 index 000000000..6787da62e --- /dev/null +++ b/3rdparty/utfcpp/tests/test_checked_api.cpp @@ -0,0 +1,188 @@ +#include "gtest/gtest.h" +#include "utf8.h" + +#include <string> +#include <vector> +using namespace utf8; +using namespace std; + + +TEST(CheckedAPITests, test_append) +{ + unsigned char u[5] = {0,0,0,0,0}; + append(0x0448, u); + EXPECT_EQ (u[0], 0xd1); + EXPECT_EQ (u[1], 0x88); + EXPECT_EQ (u[2], 0); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x65e5, u); + EXPECT_EQ (u[0], 0xe6); + EXPECT_EQ (u[1], 0x97); + EXPECT_EQ (u[2], 0xa5); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x3044, u); + EXPECT_EQ (u[0], 0xe3); + EXPECT_EQ (u[1], 0x81); + EXPECT_EQ (u[2], 0x84); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x10346, u); + 
EXPECT_EQ (u[0], 0xf0); + EXPECT_EQ (u[1], 0x90); + EXPECT_EQ (u[2], 0x8d); + EXPECT_EQ (u[3], 0x86); + EXPECT_EQ (u[4], 0); +} + +TEST(CheckedAPITests, test_next) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars; + int cp = next(w, twochars + 6); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars + 3); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars; + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars + 4); + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 7); + + cp = next(w, threechars + 9); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 9); +} + +TEST(CheckedAPITests, test_peek_next) +{ + const char* const cw = "\xe6\x97\xa5\xd1\x88"; + int cp = peek_next(cw, cw + 6); + EXPECT_EQ (cp, 0x65e5); +} + +TEST(CheckedAPITests, test_prior) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars + 3; + int cp = prior (w, twochars); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars + 9; + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 7); + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 4); + cp = prior(w, threechars); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars); +} + +TEST(CheckedAPITests, test_advance) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + const char* w = threechars; + advance(w, 2, threechars + 9); + EXPECT_EQ(w, threechars + 7); + advance(w, -2, threechars); + EXPECT_EQ(w, threechars); + advance(w, 3, threechars + 9); + EXPECT_EQ(w, threechars + 9); + advance(w, -2, threechars); + EXPECT_EQ(w, threechars + 4); + advance(w, -1, threechars); + EXPECT_EQ(w, threechars); +} + +TEST(CheckedAPITests, test_distance) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + size_t dist = 
utf8::distance(twochars, twochars + 5); + EXPECT_EQ (dist, 2); +} + +TEST(CheckedAPITests, test_utf32to8) +{ + int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + string utf8result; + utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CheckedAPITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + vector<int> utf32result; + utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CheckedAPITests, test_utf16to8) +{ + unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string utf8result; + utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 10); +} + +TEST(CheckedAPITests, test_utf8to16) +{ + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + vector <unsigned short> utf16result; + utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CheckedAPITests, test_replace_invalid) +{ + char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + vector<char> replace_invalid_result; + replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); + bool bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); + EXPECT_TRUE (bvalid); + const char fixed_invalid_sequence[] = "a????z"; + EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size()); + EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence)); +} + +TEST(CheckedAPITests, test_find_invalid) +{ + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + EXPECT_EQ (invalid, utf_invalid + 5); +} + 
+TEST(CheckedAPITests, test_is_valid) +{ + char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid, utf_invalid + 6); + EXPECT_FALSE (bvalid); + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); + EXPECT_TRUE (bvalid); +} + +TEST(CheckedAPITests, test_starts_with_bom) +{ + unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; + bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); + EXPECT_TRUE (bbom); + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars, threechars + sizeof(threechars)); + EXPECT_FALSE (no_bbom); +} diff --git a/3rdparty/utfcpp/tests/test_checked_iterator.cpp b/3rdparty/utfcpp/tests/test_checked_iterator.cpp new file mode 100644 index 000000000..4c44834fd --- /dev/null +++ b/3rdparty/utfcpp/tests/test_checked_iterator.cpp @@ -0,0 +1,31 @@ +#include "gtest/gtest.h" +#include "utf8.h" + +using namespace utf8; + + +TEST(CheckedIteratrTests, test_increment) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + utf8::iterator<const char*> it(threechars, threechars, threechars + 9); + utf8::iterator<const char*> it2 = it; + EXPECT_EQ (it2, it); + EXPECT_EQ (*it, 0x10346); + EXPECT_EQ (*(++it), 0x65e5); + EXPECT_EQ ((*it++), 0x65e5); + EXPECT_EQ (*it, 0x0448); + EXPECT_NE (it, it2); + utf8::iterator<const char*> endit (threechars + 9, threechars, threechars + 9); + EXPECT_EQ (++it, endit); +} + +TEST(CheckedIteratrTests, test_decrement) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + utf8::iterator<const char*> it(threechars+9, threechars, threechars + 9); + EXPECT_EQ (*(--it), 0x0448); + EXPECT_EQ ((*it--), 0x0448); + EXPECT_EQ (*it, 0x65e5); + EXPECT_EQ (--it, utf8::iterator<const char*>(threechars, threechars, threechars + 9)); + EXPECT_EQ (*it, 0x10346); +} diff --git 
a/3rdparty/utfcpp/tests/test_cpp11.cpp b/3rdparty/utfcpp/tests/test_cpp11.cpp new file mode 100644 index 000000000..edcff9d31 --- /dev/null +++ b/3rdparty/utfcpp/tests/test_cpp11.cpp @@ -0,0 +1,106 @@ +#include "gtest/gtest.h" +#include "utf8.h" +#include <string> +using namespace utf8; +using namespace std; + +#if __cplusplus >= 201103L // C++ 11 or later + +TEST(CPP11APITests, test_append) +{ + string u; + append(0x0448, u); + EXPECT_EQ (u[0], char(0xd1)); + EXPECT_EQ (u[1], char(0x88)); + EXPECT_EQ (u.length(), 2); + + u.clear(); + append(0x65e5, u); + EXPECT_EQ (u[0], char(0xe6)); + EXPECT_EQ (u[1], char(0x97)); + EXPECT_EQ (u[2], char(0xa5)); + EXPECT_EQ (u.length(), 3); + + u.clear(); + append(0x3044, u); + EXPECT_EQ (u[0], char(0xe3)); + EXPECT_EQ (u[1], char(0x81)); + EXPECT_EQ (u[2], char(0x84)); + EXPECT_EQ (u.length(), 3); + + u.clear(); + append(0x10346, u); + EXPECT_EQ (u[0], char(0xf0)); + EXPECT_EQ (u[1], char(0x90)); + EXPECT_EQ (u[2], char(0x8d)); + EXPECT_EQ (u[3], char(0x86)); + EXPECT_EQ (u.length(), 4); +} + +TEST(CPP11APITests, test_utf16to8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string u = utf16to8(utf16string); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP11APITests, test_utf8to16) +{ + string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP11APITests, test_utf32to8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + string utf8result = utf32to8(utf32string); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP11APITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP11APITests, test_find_invalid) +{ + string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + 
EXPECT_EQ (invalid, 5); +} + +TEST(CPP11APITests, test_is_valid) +{ + string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP11APITests, test_replace_invalid) +{ + string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + string replace_invalid_result = replace_invalid(invalid_sequence, '?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const string fixed_invalid_sequence = "a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP11APITests, test_starts_with_bom) +{ + string byte_order_mark = {char(0xef), char(0xbb), char(0xbf)}; + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} +#endif // C++ 11 or later diff --git a/3rdparty/utfcpp/tests/test_data/utf8_invalid.txt b/3rdparty/utfcpp/tests/test_data/utf8_invalid.txt Binary files differnew file mode 100644 index 000000000..ae8315932 --- /dev/null +++ b/3rdparty/utfcpp/tests/test_data/utf8_invalid.txt diff --git a/3rdparty/utfcpp/tests/test_unchecked_api.cpp b/3rdparty/utfcpp/tests/test_unchecked_api.cpp new file mode 100644 index 000000000..e9f19ca6c --- /dev/null +++ b/3rdparty/utfcpp/tests/test_unchecked_api.cpp @@ -0,0 +1,161 @@ +#include "gtest/gtest.h" +#include "utf8/unchecked.h" + +#include <string> +#include <vector> +using namespace utf8::unchecked; +using namespace std; + +TEST(UnCheckedAPITests, test_append) +{ + unsigned char u[5] = {0,0,0,0,0}; + append(0x0448, u); + EXPECT_EQ (u[0], 0xd1); + EXPECT_EQ (u[1], 0x88); + EXPECT_EQ (u[2], 0); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x65e5, u); + EXPECT_EQ (u[0], 0xe6); + EXPECT_EQ (u[1], 0x97); + EXPECT_EQ (u[2], 
0xa5); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x3044, u); + EXPECT_EQ (u[0], 0xe3); + EXPECT_EQ (u[1], 0x81); + EXPECT_EQ (u[2], 0x84); + EXPECT_EQ (u[3], 0); + EXPECT_EQ (u[4], 0); + + append(0x10346, u); + EXPECT_EQ (u[0], 0xf0); + EXPECT_EQ (u[1], 0x90); + EXPECT_EQ (u[2], 0x8d); + EXPECT_EQ (u[3], 0x86); + EXPECT_EQ (u[4], 0); +} + +TEST(UnCheckedAPITests, test_next) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars; + int cp = utf8::unchecked::next(w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars + 3); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars; + + cp = utf8::unchecked::next(w); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars + 4); + + cp = utf8::unchecked::next(w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 7); + + cp = utf8::unchecked::next(w); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 9); +} + +TEST(UnCheckedAPITests, test_peek_next) +{ + const char* const cw = "\xe6\x97\xa5\xd1\x88"; + int cp = peek_next(cw); + EXPECT_EQ (cp, 0x65e5); +} + +TEST(UnCheckedAPITests, test_prior) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + const char* w = twochars + 3; + int cp = prior (w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, twochars); + + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + w = threechars + 9; + cp = prior(w); + EXPECT_EQ (cp, 0x0448); + EXPECT_EQ (w, threechars + 7); + cp = prior(w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, threechars + 4); + cp = prior(w); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, threechars); +} + +TEST(UnCheckedAPITests, test_advance) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + const char* w = threechars; + utf8::unchecked::advance(w, 2); + EXPECT_EQ(w, threechars + 7); + utf8::unchecked::advance(w, -2); + EXPECT_EQ(w, threechars); + utf8::unchecked::advance(w, 3); + EXPECT_EQ(w, threechars + 9); + utf8::unchecked::advance(w, -2); + EXPECT_EQ(w, 
threechars + 4); + utf8::unchecked::advance(w, -1); + EXPECT_EQ(w, threechars); +} + +TEST(UnCheckedAPITests, test_distance) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + size_t dist = utf8::unchecked::distance(twochars, twochars + 5); + EXPECT_EQ (dist, 2); +} + +TEST(UnCheckedAPITests, test_utf32to8) +{ + int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; + string utf8result; + utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(UnCheckedAPITests, test_utf8to32) +{ + const char* twochars = "\xe6\x97\xa5\xd1\x88"; + vector<int> utf32result; + utf8to32(twochars, twochars + 5, back_inserter(utf32result)); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(UnCheckedAPITests, test_utf16to8) +{ + unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + string utf8result; + utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); + EXPECT_EQ (utf8result.size(), 10); +} + +TEST(UnCheckedAPITests, test_utf8to16) +{ + char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + vector <unsigned short> utf16result; + utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(UnCheckedAPITests, test_replace_invalid) +{ + char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + vector<char> replace_invalid_result; + replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?'); + bool bvalid = utf8::is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); + EXPECT_TRUE (bvalid); + const char fixed_invalid_sequence[] = "a????z"; + EXPECT_EQ (sizeof(fixed_invalid_sequence), replace_invalid_result.size()); + EXPECT_TRUE (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), 
fixed_invalid_sequence)); +} + diff --git a/3rdparty/utfcpp/tests/test_unchecked_iterator.cpp b/3rdparty/utfcpp/tests/test_unchecked_iterator.cpp new file mode 100644 index 000000000..103e8e28a --- /dev/null +++ b/3rdparty/utfcpp/tests/test_unchecked_iterator.cpp @@ -0,0 +1,32 @@ +#include "gtest/gtest.h" +#include "utf8/unchecked.h" + +using namespace utf8::unchecked; + + +TEST(UnCheckedIteratrTests, test_increment) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + utf8::unchecked::iterator<const char*> it(threechars); + utf8::unchecked::iterator<const char*> it2 = it; + EXPECT_EQ (it2, it); + EXPECT_EQ (*it, 0x10346); + EXPECT_EQ (*(++it), 0x65e5); + EXPECT_EQ ((*it++), 0x65e5); + EXPECT_EQ (*it, 0x0448); + EXPECT_NE (it, it2); + utf8::unchecked::iterator<const char*> endit (threechars + 9); + EXPECT_EQ (++it, endit); +} + +TEST(UnCheckedIteratrTests, test_decrement) +{ + const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + utf8::unchecked::iterator<const char*> it(threechars+9); + EXPECT_EQ (*(--it), 0x0448); + EXPECT_EQ ((*it--), 0x0448); + EXPECT_EQ (*it, 0x65e5); + EXPECT_EQ (--it, utf8::unchecked::iterator<const char*>(threechars)); + EXPECT_EQ (*it, 0x10346); + +} diff --git a/CODING.md b/CODING.md new file mode 100644 index 000000000..3b37ca368 --- /dev/null +++ b/CODING.md @@ -0,0 +1,5 @@ +# Naming Conventions + +* Classes/Structs - PascalCase +* Functions - CamelCase +* Class member variables - m_PascalCase diff --git a/README.md b/README.md new file mode 100644 index 000000000..6854e99c4 --- /dev/null +++ b/README.md @@ -0,0 +1,64 @@ +# Zen Storage Service (zenserver) + +This is the implementation of the local storage service for UE5. 
It is intended to be deployed on +user machines either as a daemon or launched ad hoc as required during of editor/cooker/game startup + +This repo also contains a VFS prototype (zenfs) which is currently not functional, it's a prototype +which has decayed, hopefully it can be revisited at some point. + +## Setup + +To build the code you will need Visual Studio 2019 (we use c++20 features), git and vcpkg. + +* Install Visual Studio 2019 Version 16.9.4 or later (16.10 is recommended as it contains improvements to debug codegen which have a pretty significant impact when iterating in debug mode) +* Install [git](https://git-scm.com/download/win) + +We use vcpkg to manage some libraries. Right now it's not set up on a project local +basis and requires manual bootstrap so you will need to do the following at least once: + +* open up a command line window + * create a `git`/`github` directory somewhere for you to clone repos into + * issue `git clone https://github.com/bionicbeagle/vcpkg.git` and build it using the `bootstrap-vcpkg.bat` script. This git repo is temporary and will change in the future but it should be an easy upgrade when the time comes +* optional: add the `vcpkg` directory you cloned to your PATH to allow invoking vcpkg on the command line +* issue `vcpkg integrate install` + +Now you are ready to start building! + +* clone the `zen` repository if you haven't already + * run `git clone https://github.com/EpicGames/zen.git`, or use Visual Studio integrated git to clone + and open the repo +* open the `zen.sln` VS solution (NOTE: you currently need to run Visual Studio in ADMIN mode since + http.sys requires elevation) + * you can now build and run `zenserver` as usual from Visual Studio + * third-party dependencies will be built the first time via the `vcpkg` integration. 
This is not as + fast as it could be (it does not go wide) but should only happen on the first build + +# Implementation Notes + +* The implementation currently depends only on a few libraries including the C++ standard library +* It uses exceptions for errors +* It is currently not portable as it uses Windows APIs directly. But as we all know, there is no + portable code, just code that has been ported many times. The plan is to implement support for + MacOS and Linux soon, and some research to enable it has been done +* `zenservice.exe` currently requires elevated access to enable `http.sys` access. This will be relaxed + in the future by offering to use a portable server interface without elevation +* The service endpoints are currently open on all NICs and will respond to requests from any host. This + will be tightened up in the future to require some degree of authentication to satisfy security + requirements + +# Testing + +* There are some test projects + * `zencore-test` exercises unit tests in the zencore project + * `zenserver-test` exercises the zen server itself (functional tests) + +The tests are implemented using [doctest](https://github.com/onqtam/doctest), which is similar to Catch in usage. + +# Coding Standards + +See [CODING.md](CODING.md) + +Run `prepare_commit.bat` before committing code. It ensures all source files are formatted with +clang-format which you will need to install. + +(More helpful instructions needed here :) diff --git a/RESTAPI.md b/RESTAPI.md new file mode 100644 index 000000000..6b072d188 --- /dev/null +++ b/RESTAPI.md @@ -0,0 +1,16 @@ +# REST API + +## Test Service + +Intended to be used for basic connectivity testing. 
Allows the client to fetch +various kinds of payloads via well-known URIs + +HTTP endpoint: `/test` + +`/test/size/{size}` - verbs: (`GET`) + +## Cache Service + +HTTP endpoint: `/cache` + +`/cache/` diff --git a/TODO.md b/TODO.md new file mode 100644 index 000000000..f1520d2d4 --- /dev/null +++ b/TODO.md @@ -0,0 +1,44 @@ +# Use-cases + +* Mirage cache +* Editor Domain +* COTF2 +* Target Domain / Build Store + +# General + +* Switch to CMake projects for cross-platform builds? +* Should get rid of stack-dependent RefCount initialization +* Upgrade to CPR 1.6.0 for more efficient downloads +* Implement support for `CbFieldType::CustomById` / `CbFieldType::CustomByName` + +# Upstream Connectivity + +## Jupiter + +* High-performance/concurrency HTTP client (on asio) + +# Peer Connectivity + +* Beacon + +# Downstream Connectivity + +## Runtime + +* High performance HTTP client (layered on asio or UE sockets) + +## Cooker + +## Editor + +## Mirage + +# Local Features + +* VFS for surfacing debugging information + +# TPS + +* nodejs/http_parser +* all the rest (do we need TPS for vcpkg packages?) diff --git a/build.bat b/build.bat new file mode 100644 index 000000000..55efab694 --- /dev/null +++ b/build.bat @@ -0,0 +1,4 @@ +mkdir dockerfiles/bin +robocopy /mir %~dp0x64\release dockerfiles\bin + +docker build -t ucache dockerfiles diff --git a/docs/cpp-coding/00-Table_of_Contents.md b/docs/cpp-coding/00-Table_of_Contents.md new file mode 100644 index 000000000..67008eba9 --- /dev/null +++ b/docs/cpp-coding/00-Table_of_Contents.md @@ -0,0 +1,15 @@ + + 1. [Preface](01-Preface.md) + 2. [Use the Tools Available](02-Use_the_Tools_Available.md) + 3. [Style](03-Style.md) + 4. [Considering Safety](04-Considering_Safety.md) + 5. [Considering Maintainability](05-Considering_Maintainability.md) + 6. [Considering Portability](06-Considering_Portability.md) + 7. [Considering Threadability](07-Considering_Threadability.md) + 8. 
[Considering Performance](08-Considering_Performance.md) + 9. [Considering Correctness](09-Considering_Correctness.md) + 10. [Enable Scripting](10-Enable_Scripting.md) + 11. [Further Reading](11-Further_Reading.md) + 12. [Final Thoughts](12-Final_Thoughts.md) + + diff --git a/docs/cpp-coding/01-Preface.md b/docs/cpp-coding/01-Preface.md new file mode 100644 index 000000000..fac2b8109 --- /dev/null +++ b/docs/cpp-coding/01-Preface.md @@ -0,0 +1,16 @@ +# Preface + +C++ Best Practices: A Forkable Coding Standards Document + +This document is meant to be a collaborative discussion of the best practices in C++. It complements books such as *Effective C++* (Meyers) and *C++ Coding Standards* (Alexandrescu, Sutter). We fill in some of the lower level details that they don't discuss and provide specific stylistic recommendations while also discussing how to ensure overall code quality. + +In all cases brevity and succinctness is preferred. Examples are preferred for making the case for why one option is preferred over another. If necessary, words will be used. + + +<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">C++ Best Practices</span> by <a xmlns:cc="http://creativecommons.org/ns#" href="http://cppbestpractices.com" property="cc:attributionName" rel="cc:attributionURL">Jason Turner</a> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>. + +*Disclaimer* + +This document is based on my personal experiences. You are not supposed to agree with it 100%. 
It exists as a book on [GitHub](https://github.com/lefticus/cppbestpractices) so that you can fork it for your own uses or submit back proposed changes for everyone to share. + +This book has inspired an O'Reilly video: [Learning C++ Best Practices](http://shop.oreilly.com/product/0636920049814.do) diff --git a/docs/cpp-coding/02-Use_the_Tools_Available.md b/docs/cpp-coding/02-Use_the_Tools_Available.md new file mode 100644 index 000000000..15c4df76d --- /dev/null +++ b/docs/cpp-coding/02-Use_the_Tools_Available.md @@ -0,0 +1,415 @@ +# Use The Tools Available + +An automated framework for executing these tools should be established very early in the development process. It should not take more than 2-3 commands to checkout the source code, build, and execute the tests. Once the tests are done executing, you should have an almost complete picture of the state and quality of the code. + +## Source Control + +Source control is an absolute necessity for any software development project. If you are not using one yet, start using one. + + * [GitHub](https://github.com/) - allows for unlimited public repositories, and unlimited private repositories with up to 3 collaborators. + * [Bitbucket](https://bitbucket.org/) - allows for unlimited private repositories with up to 5 collaborators, for free. + * [SourceForge](http://sourceforge.net/) - open source hosting only. + * [GitLab](https://gitlab.com/) - allows for unlimited public and private repositories, unlimited CI Runners included, for free. + * [Visual Studio Online](https://visualstudio.com) (http://www.visualstudio.com/what-is-visual-studio-online-vs) - allows for unlimited public repositories, must pay for private repository. Repositories can be git or TFVC. Additionally: Issue tracking, project planning (multiple Agile templates, such as SCRUM), integrated hosted builds, integration of all this into Microsoft Visual Studio. Windows only. + +## Build Tool + +Use an industry standard widely accepted build tool. 
This prevents you from reinventing the wheel whenever you discover / link to a new library / package your product / etc. Examples include: + + * [Autotools](https://autotools.io) - The traditional GNU build system. + * [CMake](http://www.cmake.org/) + * Consider: https://github.com/sakra/cotire/ for build performance + * Consider: https://github.com/toeb/cmakepp for enhanced usability + * Utilize: https://cmake.org/cmake/help/v3.6/command/target_compile_features.html for C++ standard flags + * Consider: https://github.com/cheshirekow/cmake_format for automatic formatting of your CMakeLists.txt + * See the [Further Reading](11-Further_Reading.md) section for CMake specific best practices + * `cmake --build` provides a common interface for compiling your project regardless of platform + * [Waf](https://waf.io/) + * [FASTBuild](http://www.fastbuild.org/) + * [Ninja](https://ninja-build.org/) - Can greatly improve the incremental build time of your larger projects. Can be used as a target for CMake. + * [Bazel](http://bazel.io/) - Fast incremental builds using network artefact caching and remote execution. + * [Buck](http://buckbuild.com/) - Similar to Bazel, with very good support for iOS and Android. + * [gyp](https://chromium.googlesource.com/external/gyp/) - Google's build tool for chromium. + * [maiken](https://github.com/Dekken/maiken) - Crossplatform build tool with Maven-esque configuration style. + * [Qt Build Suite](http://doc.qt.io/qbs/) - Crossplatform build tool from Qt. + * [meson](http://mesonbuild.com/index.html) - Open source build system meant to be both extremely fast, and, even more importantly, as user friendly as possible. + * [premake](https://premake.github.io/) + * [xmake](https://xmake.io) - A cross-platform build utility based on Lua. Modern C/C++ build tools, Support multi-language hybrid compilation + +Remember, it's not just a build tool, it's also a programming language. 
Try to maintain good clean build scripts and follow the recommended practices for the tool you are using. + +## Package Manager + +Package management is an important topic in C++, with currently no clear winner. Consider using a package manager to help you keep track of the dependencies for your project and make it easier for new people to get started with the project. + + * [Conan](https://www.conan.io/) - a crossplatform dependency manager for C++ + * [hunter](https://github.com/ruslo/hunter) - CMake driven cross-platform package manager for C/C++ + * [C++ Archive Network (CPPAN)](https://cppan.org/) - a crossplatform dependency manager for C++ + * [qpm](https://www.qpm.io/) - Package manager for Qt + * [build2](https://build2.org/) - cargo-like package management for C++ + * [Buckaroo](https://buckaroo.pm) - Truly decentralized cross-platform dependency manager for C/C++ and more + * [Vcpkg](https://github.com/microsoft/vcpkg) - Microsoft C++ Library Manager for Windows, Linux, and MacOS - [description](https://docs.microsoft.com/en-us/cpp/build/vcpkg) + +## Continuous Integration + +Once you have picked your build tool, set up a continuous integration environment. + +Continuous Integration (CI) tools automatically build the source code as changes are pushed to the repository. These can be hosted privately or with a CI host. 
+ * [Travis CI](http://travis-ci.org) + * works well with C++ + * designed for use with GitHub + * free for public repositories on GitHub + * [AppVeyor](http://www.appveyor.com/) + * supports Windows, MSVC and MinGW + * free for public repositories on GitHub + * [Hudson CI](http://hudson-ci.org/) / [Jenkins CI](https://jenkins-ci.org/) + * Java Application Server is required + * supports Windows, OS X, and Linux + * extendable with a lot of plugins + * [TeamCity](https://www.jetbrains.com/teamcity) + * has a free option for open source projects + * [Decent CI](https://github.com/lefticus/decent_ci) + * simple ad-hoc continuous integration that posts results to GitHub + * supports Windows, OS X, and Linux + * used by [ChaiScript](http://chaiscript.com/ChaiScript-BuildResults/full_dashboard.html) + * [Visual Studio Online](https://visualstudio.com) (http://www.visualstudio.com/what-is-visual-studio-online-vs) + * Tightly integrated with the source repositories from Visual Studio Online + * Uses MSBuild (Visual Studio's build engine), which is available on Windows, OS X and Linux + * Provides hosted build agents and also allows for user-provided build agents + * Can be controlled and monitored from within Microsoft Visual Studio + * On-Premise installation via Microsoft Team Foundation Server + * [GitLab](https://gitlab.com) + * use custom Docker images, so can be used for C++ + * has free shared runners + * has trivial processing of coverage analysis results + +If you have an open source, publicly-hosted project on GitHub: + + * go enable Travis CI and AppVeyor integration right now. We'll wait for you to come back. For a simple example of how to enable it for your C++ CMake-based application, see here: https://github.com/ChaiScript/ChaiScript/blob/master/.travis.yml + * enable one of the coverage tools listed below (Codecov or Coveralls) + * enable [Coverity Scan](https://scan.coverity.com) + +These tools are all free and relatively easy to set up. 
Once they are set up you are getting continuous building, testing, analysis and reporting of your project. For free. + + +## Compilers + +Use every available and reasonable set of warning options. Some warning options only work with optimizations enabled, or work better the higher the chosen level of optimization is, for example [`-Wnull-dereference`](https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html#index-Wnull-dereference-367) with GCC. + +You should use as many compilers as you can for your platform(s). Each compiler implements the standard slightly differently and supporting multiple will help ensure the most portable, most reliable code. + +### GCC / Clang + +`-Wall -Wextra -Wshadow -Wnon-virtual-dtor -pedantic` + + * `-Wall -Wextra` reasonable and standard + * `-Wshadow` warn the user if a variable declaration shadows one from a parent context + * `-Wnon-virtual-dtor` warn the user if a class with virtual functions has a non-virtual destructor. This helps catch hard to track down memory errors + * `-Wold-style-cast` warn for c-style casts + * `-Wcast-align` warn for potential performance problem casts + * `-Wunused` warn on anything being unused + * `-Woverloaded-virtual` warn if you overload (not override) a virtual function + * `-Wpedantic` (all versions of GCC, Clang >= 3.2) warn if non-standard C++ is used + * `-Wconversion` warn on type conversions that may lose data + * `-Wsign-conversion` (Clang all versions, GCC >= 4.3) warn on sign conversions + * `-Wmisleading-indentation` (only in GCC >= 6.0) warn if indentation implies blocks where blocks do not exist + * `-Wduplicated-cond` (only in GCC >= 6.0) warn if `if` / `else` chain has duplicated conditions + * `-Wduplicated-branches` (only in GCC >= 7.0) warn if `if` / `else` branches have duplicated code + * `-Wlogical-op` (only in GCC) warn about logical operations being used where bitwise were probably wanted + * `-Wnull-dereference` (only in GCC >= 6.0) warn if a null dereference is detected + * 
`-Wuseless-cast` (only in GCC >= 4.8) warn if you perform a cast to the same type + * `-Wdouble-promotion` (GCC >= 4.6, Clang >= 3.8) warn if `float` is implicitly promoted to `double` + * `-Wformat=2` warn on security issues around functions that format output (ie `printf`) + * `-Wlifetime` (only special branch of Clang currently) shows object lifetime issues + +Consider using `-Weverything` and disabling the few warnings you need to on Clang + + +`-Weffc++` warning mode can be too noisy, but if it works for your project, use it also. + +### MSVC + +`/permissive-` - [Enforces standards conformance](https://docs.microsoft.com/en-us/cpp/build/reference/permissive-standards-conformance). + +`/W4 /w14640` - use these and consider the following (see descriptions below) + + * `/W4` All reasonable warnings + * `/w14242` 'identifier': conversion from 'type1' to 'type2', possible loss of data + * `/w14254` 'operator': conversion from 'type1:field_bits' to 'type2:field_bits', possible loss of data + * `/w14263` 'function': member function does not override any base class virtual member function + * `/w14265` 'classname': class has virtual functions, but destructor is not virtual instances of this class may not be destructed correctly + * `/w14287` 'operator': unsigned/negative constant mismatch + * `/we4289` nonstandard extension used: 'variable': loop control variable declared in the for-loop is used outside the for-loop scope + * `/w14296` 'operator': expression is always 'boolean_value' + * `/w14311` 'variable': pointer truncation from 'type1' to 'type2' + * `/w14545` expression before comma evaluates to a function which is missing an argument list + * `/w14546` function call before comma missing argument list + * `/w14547` 'operator': operator before comma has no effect; expected operator with side-effect + * `/w14549` 'operator': operator before comma has no effect; did you intend 'operator'? 
+ * `/w14555` expression has no effect; expected expression with side-effect + * `/w14619` pragma warning: there is no warning number 'number' + * `/w14640` Enable warning on thread un-safe static member initialization + * `/w14826` Conversion from 'type1' to 'type_2' is sign-extended. This may cause unexpected runtime behavior. + * `/w14905` wide string literal cast to 'LPSTR' + * `/w14906` string literal cast to 'LPWSTR' + * `/w14928` illegal copy-initialization; more than one user-defined conversion has been implicitly applied + +Not recommended + + * `/Wall` - Also warns on files included from the standard library, so it's not very useful and creates too many extra warnings. + + + +### General + +Start with very strict warning settings from the beginning. Trying to raise the warning level after the project is underway can be painful. + +Consider using the *treat warnings as errors* setting. `/WX` with MSVC, `-Werror` with GCC / Clang + +## LLVM-based tools + +LLVM based tools work best with a build system (such as cmake) that can output a compile command database, for example: + +``` +$ cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON . +``` + +If you are not using a build system like that, you can consider [Build EAR](https://github.com/rizsotto/Bear) which will hook into your build system and generate a compile command database for you. + +CMake now also comes with built-in support for calling `clang-tidy` during [normal compilation](https://cmake.org/cmake/help/latest/prop_tgt/LANG_CLANG_TIDY.html). 
+ + * [include-what-you-use](https://github.com/include-what-you-use), [example results](https://github.com/ChaiScript/ChaiScript/commit/c0bf6ee99dac14a19530179874f6c95255fde173) + * [clang-modernize](http://clang.llvm.org/extra/clang-modernize.html), [example results](https://github.com/ChaiScript/ChaiScript/commit/6eab8ddfe154a4ebbe956a5165b390ee700fae1b) + * [clang-check](http://clang.llvm.org/docs/ClangCheck.html) + * [clang-tidy](http://clang.llvm.org/extra/clang-tidy.html) + +## Static Analyzers + +The best bet is the static analyzer that you can run as part of your automated build system. Cppcheck and clang meet that requirement for free options. + +### Coverity Scan + +[Coverity](https://scan.coverity.com/) has a free (for open source) static analysis toolkit that can work on every commit in integration with [Travis CI](http://travis-ci.org) and [AppVeyor](http://www.appveyor.com/). + +### PVS-Studio + +[PVS-Studio](http://www.viva64.com/en/pvs-studio/) is a tool for bug detection in the source code of programs, written in C, C++ and C#. It is free for personal academic projects, open source non-commercial projects and independent projects of individual developers. It works in Windows and Linux environment. + +### Cppcheck +[Cppcheck](http://cppcheck.sourceforge.net/) is free and open source. It strives for 0 false positives and does a good job at it. Therefore all warnings should be enabled: `--enable=all` + +Notes: + + * For correct work it requires well formed path for headers, so before usage don't forget to pass: `--check-config`. + * Finding unused headers does not work with `-j` more than 1. + * Remember to add `--force` for code with a lot number of `#ifdef` if you need check all of them. + +### cppclean + +[cppclean](https://github.com/myint/cppclean) - Open source static analyzer focused on finding problems in C++ source that slow development of large code bases. 
+ + +### CppDepend + +[CppDepend](https://www.cppdepend.com/) Simplifies managing a complex C/C++ code base by analyzing and visualizing code dependencies, by defining design rules, by doing impact analysis, and comparing different versions of the code. It's free for OSS contributors. + +### Clang's Static Analyzer + +Clang's analyzer's default options are good for the respective platform. It can be used directly [from CMake](http://garykramlich.blogspot.com/2011/10/using-scan-build-from-clang-with-cmake.html). They can also be called via clang-check and clang-tidy from the [LLVM-based Tools](#llvm-based-tools). + +Also, [CodeChecker](https://github.com/Ericsson/CodeChecker) is available as a front-end to clang's static analysis. + +`clang-tidy` can be easily used with Visual Studio via the [Clang Power Tools](https://clangpowertools.com) extension. + +### MSVC's Static Analyzer + +Can be enabled with the `/analyze` [command line option](http://msdn.microsoft.com/en-us/library/ms173498.aspx). For now we will stick with the default options. + +### Flint / Flint++ + +[Flint](https://github.com/facebook/flint) and [Flint++](https://github.com/L2Program/FlintPlusPlus) are linters that analyze C++ code against Facebook's coding standards. + +### OCLint + +[OCLint](http://oclint.org/) is a free, libre and open source static code analysis tool for improving quality of C++ code in many different ways. + +### ReSharper C++ / CLion + +Both of these tools from [JetBrains](https://www.jetbrains.com/cpp/) offer some level of static analysis and automated fixes for common things that can be done better. They have options available for free licenses for open source project leaders. + +### Cevelop + +The Eclipse based [Cevelop](https://www.cevelop.com/) IDE has various static analysis and refactoring / code fix tools available. 
For example, you can replace macros with C++ `constexprs`, refactor namespaces (extract/inline `using`, qualify name), and refactor your code to C++11's uniform initialization syntax. Cevelop is free to use. + +### Qt Creator + +Qt Creator can plug into the clang static analyzer. + +### clazy + +[clazy](https://github.com/KDE/clazy) is a clang based tool for analyzing Qt usage. + +### IKOS + +[IKOS](https://ti.arc.nasa.gov/opensource/ikos/) is an open source static analyzer, developed by NASA. It is based on the Abstract Interpretation. It is written in C++ and provides an analyzer for C and C++, using LLVM. +The source code is [available on Github](https://github.com/NASA-SW-VnV/ikos). + +## Runtime Checkers + +### Code Coverage Analysis + +A coverage analysis tool shall be run when tests are executed to make sure the entire application is being tested. Unfortunately, coverage analysis requires that compiler optimizations be disabled. This can result in significantly longer test execution times. + + * [Codecov](https://codecov.io/) + * integrates with Travis CI and AppVeyor + * free for open source projects + * [Coveralls](https://coveralls.io/) + * integrates with Travis CI and AppVeyor + * free for open source projects + * [LCOV](http://ltp.sourceforge.net/coverage/lcov.php) + * very configurable + * [Gcovr](http://gcovr.com/) + * [kcov](http://simonkagstrom.github.io/kcov/index.html) + * integrates with codecov and coveralls + * performs code coverage reporting without needing special compiler flags, just by instrumenting debug symbols. + * [OpenCppCoverage](https://github.com/OpenCppCoverage/OpenCppCoverage) - open source coverage reporting tool for Windows. + + +### Heap profiling + + * [Valgrind](http://www.valgrind.org/) + * Valgrind is a runtime code analyzer that can detect memory leaks, race conditions, and other associated problems. It is supported on various Unix platforms. 
+ * [Heaptrack](https://github.com/KDE/heaptrack) + * A profiler created by a Valgrind's Massif developer. Quite similar to Massif with pros and cons over it, way more intuitive though. + * [Dr Memory](http://www.drmemory.org) + * [Memoro](https://epfl-vlsc.github.io/memoro/) - A detailed heap profiler. + +### CPU profiling + + * [Hotspot](https://github.com/KDAB/hotspot) - An intuitive front-end to visualize data produced by the [perf](https://perf.wiki.kernel.org) CPU profiler. + * [uftrace](https://github.com/namhyung/uftrace) - Can be used to generate function call graphs of a program execution. + +### Reverse engineering tools + + * [Cutter](https://cutter.re/) - A front-end for [Radare2](https://www.radare.org/n/radare2.html). It provides tools such as decompiler, disassembly, graph visualizer, hex editor. + +### GCC / Clang Sanitizers + +These tools provide many of the same features as Valgrind, but built into the compiler. They are easy to use and provide a report of what went wrong. + + * AddressSanitizer + * MemorySanitizer + * ThreadSanitizer + * UndefinedBehaviorSanitizer + +Be aware of the sanitizer options available, including runtime options. https://kristerw.blogspot.com/2018/06/useful-gcc-address-sanitizer-checks-not.html + +### Fuzzy Analyzers + +If your project accepts user defined input, consider running a fuzzy input tester. + +Both of these tools use coverage reporting to find new code execution paths and try to breed novel inputs for your code. They can find crashes, hangs, and inputs you didn't know were considered valid. + + * [american fuzzy lop](http://lcamtuf.coredump.cx/afl/) + * [LibFuzzer](http://llvm.org/docs/LibFuzzer.html) + * [KLEE](http://klee.github.io/) - Can be used to fuzz individual functions + +#### Continuous Fuzzing + +Continuous fuzzing tools exist to run fuzz tests for you with each commit. 
+ * [Fuzzit](https://fuzzit.dev/) + +### Mutation Testers + +These tools take code executed during unit test runs and mutate the executed code. If the test continues to pass with a mutation in place, then there is likely a flawed test in your suite. + + * [Dextool Mutate](https://github.com/joakim-brannstrom/dextool/tree/master/plugin/mutate) + * [MuCPP](https://neptuno.uca.es/redmine/projects/mucpp-mutation-tool/wiki) + * [mull](https://github.com/mull-project/mull) + * [CCMutator](https://github.com/markus-kusano/CCMutator) + +### Control Flow Guard + +MSVC's [Control Flow Guard](https://msdn.microsoft.com/en-us/library/windows/desktop/mt637065%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396) adds high performance runtime security checks. + +### Checked STL Implementations + + * `_GLIBCXX_DEBUG` with GCC's libstdc++ implementation. See [Krister's blog article](https://kristerw.blogspot.se/2018/03/detecting-incorrect-c-stl-usage.html). + +### Heap Profiling + + * [Memoro](https://epfl-vlsc.github.io/memoro/) - A detailed heap profiler + +## Ignoring Warnings + +If it is determined by team consensus that the compiler or analyzer is warning on something that is either incorrect or unavoidable, the team will disable the specific warning for as localized a part of the code as possible. + +Be sure to reenable the warning after disabling it for a section of code. You do not want your disabled warnings to [leak into other code](http://www.forwardscattering.org/post/48). + +## Testing + +CMake, mentioned above, has a built in framework for executing tests. Make sure whatever build system you use has a way to execute tests built in. + +To further aid in executing tests, consider a library such as [Google Test](https://github.com/google/googletest), [Catch](https://github.com/philsquared/Catch), [CppUTest](https://github.com/cpputest/cpputest) or [Boost.Test](http://www.boost.org/doc/libs/release/libs/test/) to help you organize the tests. 
+ +### Unit Tests + +Unit tests are for small chunks of code, individual functions which can be tested standalone. + +### Integration Tests + +There should be a test enabled for every feature or bug fix that is committed. See also [Code Coverage Analysis](#code-coverage-analysis). These are tests that are higher level than unit tests. They should still be limited in scope to individual features. + +### Negative Testing + +Don't forget to make sure that your error handling is being tested and works properly as well. This will become obvious if you aim for 100% code coverage. + +## Debugging + +### GDB + +[GDB](https://www.gnu.org/software/gdb/) - The GNU debugger, powerful and widely used. Most IDEs implement an interface to use it. + +### rr + +[rr](http://rr-project.org/) is a free (open source) reverse debugger that supports C++. + +## Other Tools + +### Lizard + +[Lizard](http://www.lizard.ws/) provides a very simple interface for running complexity analysis against a C++ codebase. + +### Metrix++ + +[Metrix++](http://metrixplusplus.sourceforge.net/) can identify and report on the most complex sections of your code. Reducing complex code helps you and the compiler understand it better and optimize it better. + +### ABI Compliance Checker + +[ABI Compliance Checker](http://ispras.linuxbase.org/index.php/ABI_compliance_checker) (ACC) can analyze two library versions and generates a detailed compatibility report regarding API and C++ ABI changes. This can help a library developer spot unintentional breaking changes to ensure backward compatibility. + +### CNCC + +[Customizable Naming Convention Checker](https://github.com/mapbox/cncc) can report on identifiers in your code that do not follow certain naming conventions. + +### ClangFormat + +[ClangFormat](http://clang.llvm.org/docs/ClangFormat.html) can check and correct code formatting to match organizational conventions automatically. 
[Multipart series](https://engineering.mongodb.com/post/succeeding-with-clangformat-part-1-pitfalls-and-planning/) on utilizing clang-format. + +### SourceMeter + +[SourceMeter](https://www.sourcemeter.com/) offers a free version which provides many different metrics for your code and can also call into cppcheck. + +### Bloaty McBloatface + +[Bloaty McBloatface](https://github.com/google/bloaty) is a binary size analyzer/profiler for unix-like platforms + +### pahole + +[pahole](https://linux.die.net/man/1/pahole) generates data on holes in the packing of data structures and classes in compiled code. It can also show the size of structures and how they fit within the system's cache lines. + +### BinSkim + +[BinSkim](https://github.com/Microsoft/binskim) is a binary static analysis tool that provides security and correctness results for Windows Portable Executable and *nix ELF binary formats diff --git a/docs/cpp-coding/03-Style.md b/docs/cpp-coding/03-Style.md new file mode 100644 index 000000000..241f5c733 --- /dev/null +++ b/docs/cpp-coding/03-Style.md @@ -0,0 +1,457 @@ +# Style + +Consistency is the most important aspect of style. The second most important aspect is following a style that the average C++ programmer is used to reading. + +C++ allows for arbitrary-length identifier names, so there's no reason to be terse when naming things. Use descriptive names, and be consistent in the style. + + * `CamelCase` + * `snake_case` + +are common examples. *snake_case* has the advantage that it can also work with spell checkers, if desired. + +## Establishing A Style Guideline + +Whatever style guidelines you establish, be sure to implement a `.clang-format` file that specifies the style you expect. While this cannot help with naming, it is particularly important for an open source project to maintain a consistent style. + +Every IDE and many editors have support for clang-format built in or easily installable with an add-in. 
+ * VSCode: [Microsoft C/C++ extension for VS Code](https://github.com/Microsoft/vscode-cpptools) + * CLion: https://www.jetbrains.com/help/clion/clangformat-as-alternative-formatter.html + * VisualStudio https://marketplace.visualstudio.com/items?itemName=LLVMExtensions.ClangFormat#review-details + * Resharper++: https://www.jetbrains.com/help/resharper/2017.2/Using_Clang_Format.html + * Vim + * https://github.com/rhysd/vim-clang-format + * https://github.com/chiel92/vim-autoformat + * XCode: https://github.com/travisjeffery/ClangFormat-Xcode + + + +## Common C++ Naming Conventions + + * Types start with upper case: `MyClass`. + * Functions and variables start with lower case: `myMethod`. + * Constants are all upper case: `const double PI=3.14159265358979323;`. + +C++ Standard Library (and other well-known C++ libraries like [Boost](http://www.boost.org/)) use these guidelines: + + * Macro names use upper case with underscores: `INT_MAX`. + * Template parameter names use camel case: `InputIterator`. + * All other names use snake case: `unordered_map`. + +## Distinguish Private Object Data + +Name private data with a `m_` prefix to distinguish it from public data. `m_` stands for "member" data. + +## Distinguish Function Parameters + +The most important thing is consistency within your codebase; this is one possibility to help with consistency. + +Name function parameters with a `t_` prefix. `t_` can be thought of as "the", but the meaning is arbitrary. The point is to distinguish function parameters from other variables in scope while giving us a consistent naming strategy. + +Any prefix or postfix can be chosen for your organization. This is just one example. 
*This suggestion is controversial, for a discussion about it see issue [#11](https://github.com/lefticus/cppbestpractices/issues/11).* + +```cpp +struct Size +{ + int width; + int height; + + Size(int t_width, int t_height) : width(t_width), height(t_height) {} +}; + +// This version might make sense for thread safety or something, +// but more to the point, sometimes we need to hide data, sometimes we don't. +class PrivateSize +{ + public: + int width() const { return m_width; } + int height() const { return m_height; } + PrivateSize(int t_width, int t_height) : m_width(t_width), m_height(t_height) {} + + private: + int m_width; + int m_height; +}; +``` + + + + +## Don't Name Anything Starting With `_` + +If you do, you risk colliding with names reserved for compiler and standard library implementation use: + +http://stackoverflow.com/questions/228783/what-are-the-rules-about-using-an-underscore-in-a-c-identifier + + +## Well-Formed Example + +```cpp +class MyClass +{ +public: + MyClass(int t_data) + : m_data(t_data) + { + } + + int getData() const + { + return m_data; + } + +private: + int m_data; +}; +``` + + + +## Enable Out-of-Source-Directory Builds + +Make sure generated files go into an output folder that is separate from the source folder. + + +## Use `nullptr` + +C++11 introduces `nullptr` which is a special value denoting a null pointer. This should be used instead of `0` or `NULL` to indicate a null pointer. + +## Comments + +Comment blocks should use `//`, not `/* */`. Using `//` makes it much easier to comment out a block of code while debugging. + +```cpp +// this function does something +int myFunc() +{ +} +``` + +To comment out this function block during debugging we might do: + +```cpp +/* +// this function does something +int myFunc() +{ +} +*/ +``` + +which would be impossible if the function comment header used `/* */`. 
+ +## Never Use `using namespace` in a Header File + +This causes the namespace you are `using` to be pulled into the namespace of all files that include the header file. +It pollutes the namespace and it may lead to name collisions in the future. +Writing `using namespace` in an implementation file is fine though. + + +## Include Guards + +Header files must contain a distinctly-named include guard to avoid problems with including the same header multiple times and to prevent conflicts with headers from other projects. + +```cpp +#ifndef MYPROJECT_MYCLASS_HPP +#define MYPROJECT_MYCLASS_HPP + +namespace MyProject { + class MyClass { + }; +} + +#endif +``` + +You may also consider using the `#pragma once` directive instead which is quasi-standard across many compilers. +It's short and makes the intent clear. + + +## {} Are Required for Blocks. +Leaving them off can lead to semantic errors in the code. + +```cpp +// Bad Idea +// This compiles and does what you want, but can lead to confusing +// errors if modification are made in the future and close attention +// is not paid. +for (int i = 0; i < 15; ++i) + std::cout << i << std::endl; + +// Bad Idea +// The cout is not part of the loop in this case even though it appears to be. +int sum = 0; +for (int i = 0; i < 15; ++i) + ++sum; + std::cout << i << std::endl; + + +// Good Idea +// It's clear which statements are part of the loop (or if block, or whatever). +int sum = 0; +for (int i = 0; i < 15; ++i) { + ++sum; + std::cout << i << std::endl; +} +``` + +## Keep Lines a Reasonable Length + +```cpp +// Bad Idea +// hard to follow +if (x && y && myFunctionThatReturnsBool() && caseNumber3 && (15 > 12 || 2 < 3)) { +} + +// Good Idea +// Logical grouping, easier to read +if (x && y && myFunctionThatReturnsBool() + && caseNumber3 + && (15 > 12 || 2 < 3)) { +} +``` + +Many projects and coding standards have a soft guideline that one should try to use less than about 80 or 100 characters per line. 
+Such code is generally easier to read. +It also makes it possible to have two separate files next to each other on one screen without having a tiny font. + + +## Use "" for Including Local Files +... `<>` is [reserved for system includes](http://blog2.emptycrate.com/content/when-use-include-verses-include). + +```cpp +// Bad Idea. Requires extra -I directives to the compiler +// and goes against standards. +#include <string> +#include <includes/MyHeader.hpp> + +// Worse Idea +// Requires potentially even more specific -I directives and +// makes code more difficult to package and distribute. +#include <string> +#include <MyHeader.hpp> + + +// Good Idea +// Requires no extra params and notifies the user that the file +// is a local file. +#include <string> +#include "MyHeader.hpp" +``` + +## Initialize Member Variables +...with the member initializer list. + +For POD types, the performance of an initializer list is the same as manual initialization, but for other types there is a clear performance gain, see below. + +```cpp +// Bad Idea +class MyClass +{ +public: + MyClass(int t_value) + { + m_value = t_value; + } + +private: + int m_value; +}; + +// Bad Idea +// This leads to an additional constructor call for m_myOtherClass +// before the assignment. +class MyClass +{ +public: + MyClass(MyOtherClass t_myOtherClass) + { + m_myOtherClass = t_myOtherClass; + } + +private: + MyOtherClass m_myOtherClass; +}; + +// Good Idea +// There is no performance gain here but the code is cleaner. +class MyClass +{ +public: + MyClass(int t_value) + : m_value(t_value) + { + } + +private: + int m_value; +}; + +// Good Idea +// The default constructor for m_myOtherClass is never called here, so +// there is a performance gain if MyOtherClass is not is_trivially_default_constructible. 
+class MyClass +{ +public: + MyClass(MyOtherClass t_myOtherClass) + : m_myOtherClass(t_myOtherClass) + { + } + +private: + MyOtherClass m_myOtherClass; +}; +``` + +In C++11 you can assign default values to each member (using `=` or using `{}`). + +### Assigning default values with = + +```cpp +// ... // +private: + int m_value = 0; // allowed + unsigned m_value_2 = -1; // narrowing from signed to unsigned allowed +// ... // +``` +This ensures that no constructor ever "forgets" to initialize a member object. + +### Assigning default values with brace initialization + +Using brace initialization does not allow narrowing at compile-time. + +```cpp +// Best Idea + +// ... // +private: + int m_value{ 0 }; // allowed + unsigned m_value_2 { -1 }; // narrowing from signed to unsigned not allowed, leads to a compile time error +// ... // +``` + +Prefer `{}` initialization over `=` unless you have a strong reason not to. + +Forgetting to initialize a member is a source of undefined behavior bugs which are often extremely hard to find. + +If the member variable is not expected to change after the initialization, then mark it `const`. + +```cpp +class MyClass +{ +public: + MyClass(int t_value) + : m_value{t_value} + { + } + +private: + const int m_value{0}; +}; +``` + +Since a const member variable cannot be assigned a new value, such a class may not have a meaningful copy assignment operator. + +## Always Use Namespaces + +There is almost never a reason to declare an identifier in the global namespace. Instead, functions and classes should exist in an appropriately named namespace or in a class inside of a namespace. Identifiers which are placed in the global namespace risk conflicting with identifiers from other libraries (mostly C, which doesn't have namespaces). + + +## Use the Correct Integer Type for Standard Library Features + +The standard library generally uses `std::size_t` for anything related to size. The size of `size_t` is implementation defined. 
+ +In general, using `auto` will avoid most of these issues, but not all. + +Make sure you stick with the correct integer types and remain consistent with the C++ standard library. It might not warn on the platform you are currently using, but it probably will when you change platforms. + +*Note that you can cause integer underflow when performing some operations on unsigned values. For example:* + +```cpp +std::vector<int> v1{2,3,4,5,6,7,8,9}; +std::vector<int> v2{9,8,7,6,5,4,3,2,1}; +const auto s1 = v1.size(); +const auto s2 = v2.size(); +const auto diff = s1 - s2; // diff underflows to a very large number +``` + +## Use .hpp and .cpp for Your File Extensions + +Ultimately this is a matter of preference, but .hpp and .cpp are widely recognized by various editors and tools. So the choice is pragmatic. Specifically, Visual Studio only automatically recognizes .cpp and .cxx for C++ files, and Vim doesn't necessarily recognize .cc as a C++ file. + +One particularly large project ([OpenStudio](https://github.com/NREL/OpenStudio)) uses .hpp and .cpp for user-generated files and .hxx and .cxx for tool-generated files. Both are well recognized and having the distinction is helpful. + +## Never Mix Tabs and Spaces + +Some editors like to indent with a mixture of tabs and spaces by default. This makes the code unreadable to anyone not using the exact same tab indentation settings. Configure your editor so this does not happen. + +## Never Put Code with Side Effects Inside an assert() + +```cpp +assert(registerSomeThing()); // make sure that registerSomeThing() returns true +``` + +The above code succeeds when making a debug build, but gets removed by the compiler when making a release build, giving you different behavior between debug and release builds. +This is because `assert()` is a macro which expands to nothing in release mode. + +## Don't Be Afraid of Templates + +They can help you stick to [DRY principles](http://en.wikipedia.org/wiki/Don%27t_repeat_yourself). 
+They should be preferred to macros, because macros do not honor namespaces, etc. + +## Use Operator Overloads Judiciously + +Operator overloading was invented to enable expressive syntax. Expressive in the sense that adding two big integers looks like `a + b` and not `a.add(b)`. Another common example is `std::string`, where it is very common to concatenate two strings with `string1 + string2`. + +However, you can easily create unreadable expressions using too much or wrong operator overloading. When overloading operators, there are three basic rules to follow as described [on stackoverflow](http://stackoverflow.com/questions/4421706/operator-overloading/4421708#4421708). + +Specifically, you should keep these things in mind: + +* Overloading `operator=()` when handling resources is a must. See [Consider the Rule of Zero](03-Style.md#consider-the-rule-of-zero) below. +* For all other operators, only overload them when they are used in a context that is commonly connected to these operators. Typical scenarios are concatenating things with +, negating expressions that can be considered "true" or "false", etc. +* Always be aware of the [operator precedence](http://en.cppreference.com/w/cpp/language/operator_precedence) and try to circumvent unintuitive constructs. +* Do not overload exotic operators such as ~ or % unless implementing a numeric type or following a well recognized syntax in specific domain. +* [Never](http://stackoverflow.com/questions/5602112/when-to-overload-the-comma-operator?answertab=votes#tab-top) overload `operator,()` (the comma operator). +* Use non-member functions `operator>>()` and `operator<<()` when dealing with streams. For example, you can overload `operator<<(std::ostream &, MyClass const &)` to enable "writing" your class into a stream, such as `std::cout` or an `std::fstream` or `std::stringstream`. The latter is often used to create a string representation of a value. 
+* There are more common operators to overload [described here](http://stackoverflow.com/questions/4421706/operator-overloading?answertab=votes#tab-top). + +More tips regarding the implementation details of your custom operators can be found [here](http://courses.cms.caltech.edu/cs11/material/cpp/donnie/cpp-ops.html). + +## Avoid Implicit Conversions + +### Single Parameter Constructors + +Single parameter constructors can be applied at compile time to automatically convert between types. This is handy for things like `std::string(const char *)` but should be avoided in general because they can add to accidental runtime overhead. + +Instead mark single parameter constructors as `explicit`, which requires them to be explicitly called. + +### Conversion Operators + +Similarly to single parameter constructors, conversion operators can be called by the compiler and introduce unexpected overhead. They should also be marked as `explicit`. + +```cpp +//bad idea +struct S { + operator int() { + return 2; + } +}; +``` + +```cpp +//good idea +struct S { + explicit operator int() { + return 2; + } +}; +``` + +## Consider the Rule of Zero + +The Rule of Zero states that you do not provide any of the functions that the compiler can provide (copy constructor, copy assignment operator, move constructor, move assignment operator, destructor) unless the class you are constructing does some novel form of ownership. + +The goal is to let the compiler provide optimal versions that are automatically maintained when more member variables are added. + +[This article](http://www.nirfriedman.com/2015/06/27/cpp-rule-of-zero/) provides a background and explains techniques for implementing it nearly 100% of the time.
+ diff --git a/docs/cpp-coding/04-Considering_Safety.md b/docs/cpp-coding/04-Considering_Safety.md new file mode 100644 index 000000000..713676743 --- /dev/null +++ b/docs/cpp-coding/04-Considering_Safety.md @@ -0,0 +1,145 @@ +# Considering Safety + + +## Const as Much as Possible +`const` tells the compiler that a variable or method is immutable. This helps the compiler optimize the code and helps the developer know if a function has a side effect. Also, using `const &` prevents the compiler from copying data unnecessarily. The [comments on `const` from John Carmack](http://kotaku.com/454293019) are also a good read. + +```cpp +// Bad Idea +class MyClass +{ +public: + void do_something(int i); + void do_something(std::string str); +}; + + +// Good Idea +class MyClass +{ +public: + void do_something(const int i); + void do_something(const std::string &str); +}; + +``` + +### Carefully Consider Your Return Types + + * Getters + * Returning by `&` or `const &` can have significant performance savings when the normal use of the returned value is for observation + * Returning by value is better for thread safety and if the normal use of the returned value is to make a copy anyhow, there's no performance lost + * If your API uses covariant return types, you must return by `&` or `*` + * Temporaries and local values + * Always return by value. + + +references: https://github.com/lefticus/cppbestpractices/issues/21 https://twitter.com/lefticus/status/635943577328095232 + +### Do not pass and return simple types by const ref + +```cpp +// Very Bad Idea +class MyClass +{ +public: + explicit MyClass(const int& t_int_value) + : m_int_value(t_int_value) + { + } + + const int& get_int_value() const + { + return m_int_value; + } + +private: + int m_int_value; +} +``` + +Instead, pass and return simple types by value. 
If you do not plan to change the passed values, declare them as `const`, but not as `const` refs: + +```cpp +// Good Idea +class MyClass +{ +public: + explicit MyClass(const int t_int_value) + : m_int_value(t_int_value) + { + } + + int get_int_value() const + { + return m_int_value; + } + +private: + int m_int_value; +} +``` + +Why? Because passing and returning by reference leads to pointer operations, whereas passing values in processor registers is much faster. + +## Avoid Raw Memory Access + +Raw memory access, allocation and deallocation, are difficult to get correct in C++ without [risking memory errors and leaks](http://blog2.emptycrate.com/content/nobody-understands-c-part-6-are-you-still-using-pointers). C++11 provides tools to avoid these problems. + +```cpp +// Bad Idea +MyClass *myobj = new MyClass; + +// ... +delete myobj; + + +// Good Idea +auto myobj = std::make_unique<MyClass>(constructor_param1, constructor_param2); // C++14 +auto myobj = std::unique_ptr<MyClass>(new MyClass(constructor_param1, constructor_param2)); // C++11 +auto mybuffer = std::make_unique<char[]>(length); // C++14 +auto mybuffer = std::unique_ptr<char[]>(new char[length]); // C++11 + +// or for reference counted objects +auto myobj = std::make_shared<MyClass>(); + +// ... +// myobj is automatically freed for you whenever it is no longer used. +``` + +## Use `std::array` or `std::vector` Instead of C-style Arrays + +Both of these guarantee contiguous memory layout of objects and can (and should) completely replace your usage of C-style arrays for many of the reasons listed for not using bare pointers. + +Also, [avoid](http://stackoverflow.com/questions/3266443/can-you-use-a-shared-ptr-for-raii-of-c-style-arrays) using `std::shared_ptr` to hold an array. + +## Use Exceptions + +Exceptions cannot be ignored. Return values, such as using `boost::optional`, can be ignored and if not checked can cause crashes or memory errors. An exception, on the other hand, can be caught and handled.
Potentially all the way up to the highest level of the application with a log and automatic restart of the application. + +Stroustrup, the original designer of C++, [makes this point](http://www.stroustrup.com/bs_faq2.html#exceptions-why) much better than I ever could. + +## Use C++-style cast instead of C-style cast +Use the C++-style cast (static\_cast<>, dynamic\_cast<> ...) instead of the C-style cast. The C++-style cast allows more compiler checks and is considerably safer. + +```cpp +// Bad Idea +double x = getX(); +int i = (int) x; + +// Not a Bad Idea +int i = static_cast<int>(x); +``` +Additionally, the C++ cast style is more visible and is easier to search for. + +But consider refactoring of program logic (for example, additional checking on overflow and underflow) if you need to cast `double` to `int`. Measure three times and cut 0.9999999999981 times. + +## Do not define a variadic function +Variadic functions can accept a variable number of parameters. The best-known example is probably printf(). You can define this kind of function yourself, but doing so is a possible security risk. Variadic functions are not type safe, and wrong input parameters can cause program termination with undefined behavior. This undefined behavior can be exploited to create a security problem. +If you can use a compiler that supports C++11, you can use variadic templates instead. + +[It is technically possible to make typesafe C-style variadic functions with some compilers](https://github.com/lefticus/cppbestpractices/issues/53) + +## Additional Resources + +[How to Prevent The Next Heartbleed](http://www.dwheeler.com/essays/heartbleed.html) by David Wheeler is a good analysis of the current state of code safety and how to ensure safe code.
diff --git a/docs/cpp-coding/05-Considering_Maintainability.md b/docs/cpp-coding/05-Considering_Maintainability.md new file mode 100644 index 000000000..4547559e5 --- /dev/null +++ b/docs/cpp-coding/05-Considering_Maintainability.md @@ -0,0 +1,58 @@ +# Considering Maintainability + + +## Avoid Compiler Macros + +Compiler definitions and macros are replaced by the preprocessor before the compiler is ever run. This can make debugging very difficult because the debugger doesn't know where the source came from. + +```cpp +// Bad Idea +#define PI 3.14159; + +// Good Idea +namespace my_project { + class Constants { + public: + // if the above macro would be expanded, then the following line would be: + // static const double 3.14159 = 3.14159; + // which leads to a compile-time error. Sometimes such errors are hard to understand. + static constexpr double PI = 3.14159; + }; +} +``` + +## Consider Avoiding Boolean Parameters + +They do not provide any additional meaning while reading the code. You can either create a separate function that has a more meaningful name, or pass an enumeration that makes the meaning more clear. + +See http://mortoray.com/2015/06/15/get-rid-of-those-boolean-function-parameters/ for more information. + +## Avoid Raw Loops + +Know and understand the existing C++ standard algorithms and put them to use. + + * See [cppreference](https://en.cppreference.com/w/cpp/algorithm) + * Watch [C++ Seasoning](https://www.youtube.com/watch?v=qH6sSOr-yk8) + +Consider a call to `[]` as a potential code smell, indicating that an algorithm was not used where it could have been. + + +## Never Use `assert` With Side Effects + +```cpp +// Bad Idea +assert(set_value(something)); + +// Better Idea +[[maybe_unused]] const auto success = set_value(something); +assert(success); +``` + +The `assert()` will be removed in release builds which will prevent the `set_value` call from ever happening. 
+ +So while the second version is uglier, the first version is simply not correct. + + +## Properly Utilize 'override' and 'final' + +These keywords make it clear to other developers how virtual functions are being utilized, can catch potential errors if the signature of a virtual function changes, and can possibly [hint to the compiler](http://stackoverflow.com/questions/7538820/how-does-the-compiler-benefit-from-cs-new-final-keyword) of optimizations that can be performed. diff --git a/docs/cpp-coding/06-Considering_Portability.md b/docs/cpp-coding/06-Considering_Portability.md new file mode 100644 index 000000000..5fd89ef10 --- /dev/null +++ b/docs/cpp-coding/06-Considering_Portability.md @@ -0,0 +1,21 @@ +# Considering Portability + +## Know Your Types + +Most portability issues that generate warnings are because we are not careful about our types. Standard library and arrays are indexed with `size_t`. Standard container sizes are reported in `size_t`. If you get the handling of `size_t` wrong, you can create subtle lurking 64-bit issues that arise only after you start to overflow the indexing of 32-bit integers. char vs unsigned char. + +http://www.viva64.com/en/a/0010/ + +## Use The Standard Library + +### `std::filesystem` + +C++17 added a new `filesystem` library which provides portable filesystem access across all supporting compilers + +### `std::thread` + +C++11's threading capabilities should be utilized over `pthread` or `WinThreads`. + +## Other Concerns + +Most of the other concerns in this document ultimately come back to portability issues. [Avoid statics](07-Considering_Threadability.md#statics) is particularly of note. 
diff --git a/docs/cpp-coding/07-Considering_Threadability.md b/docs/cpp-coding/07-Considering_Threadability.md new file mode 100644 index 000000000..a6b9f3444 --- /dev/null +++ b/docs/cpp-coding/07-Considering_Threadability.md @@ -0,0 +1,30 @@ +# Considering Threadability + +## Avoid Global Data + +Global data leads to unintended side effects between functions and can make code difficult or impossible to parallelize. Even if the code is not intended today for parallelization, there is no reason to make it impossible for the future. + +### Statics + +Besides being global data, statics are not always constructed and destroyed as you would expect. This is particularly true in cross-platform environments. See, for example, [this g++ bug](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66830) regarding the order of destruction of shared static data loaded from dynamic modules. + +### Shared Pointers + +`std::shared_ptr` is "as good as a global" (http://stackoverflow.com/a/18803611/29975) because it allows multiple pieces of code to interact with the same data. + +### Singletons + +A singleton is often implemented with a static and/or `shared_ptr`. + +## Avoid Heap Operations + +Heap operations are much slower in threaded environments. In many or maybe even most cases, copying data is faster, especially when move operations can be used. + +## Mutex and mutable go together (M&M rule, C++11) +For member variables it is good practice to use mutex and mutable together. This applies in both ways: +* A mutable member variable is presumed to be a shared variable so it should be synchronized with a mutex (or made atomic) +* If a member variable is itself a mutex, it should be mutable. This is required to use it inside a const member function.
+ +For more information see the following article from Herb Sutter: http://herbsutter.com/2013/05/24/gotw-6a-const-correctness-part-1-3/ + +See also [related safety discussion](04-Considering_Safety.md#consider-return-by-value-for-mutable-data-const--for-immutable) about `const &` return values diff --git a/docs/cpp-coding/08-Considering_Performance.md b/docs/cpp-coding/08-Considering_Performance.md new file mode 100644 index 000000000..784ca33da --- /dev/null +++ b/docs/cpp-coding/08-Considering_Performance.md @@ -0,0 +1,356 @@ +# Considering Performance + +## Build Time + + + +### Forward Declare When Possible + +This: + +```cpp +// some header file +class MyClass; + +void doSomething(const MyClass &); +``` + +instead of: + +```cpp +// some header file +#include "MyClass.hpp" + +void doSomething(const MyClass &); +``` + + +This applies to templates as well: + +```cpp +template<typename T> class MyTemplatedType; +``` + +This is a proactive approach to reduce compilation time and rebuilding dependencies. + +*Note: forward declaration does prevent more inlining and optimizations. It's recommended to use Link Time Optimization or Link Time Code Generation for release builds.* + +### Avoid Unnecessary Template Instantiations + +Templates are not free to instantiate. Instantiating many templates, or templates with more code than necessary increases compiled code size and build time. + +For more examples see [this article](http://blog2.emptycrate.com/content/template-code-bloat-revisited-smaller-makeshared). + +### Avoid Recursive Template Instantiations + +Recursive template instantiations can result in a significant load on the compiler and more difficult to understand code. + +[Consider using variadic expansions and folds when possible instead.](http://articles.emptycrate.com/2016/05/14/folds_in_cpp11_ish.html) + +### Analyze the Build + +The tool [Templight](https://github.com/mikael-s-persson/templight) can be used to analyze the build time of your project. 
It takes some effort to get built, but once you do, it's a drop in replacement for clang++. + +After you build using Templight, you will need to analyze the results. The [templight-tools](https://github.com/mikael-s-persson/templight-tools) project provides various methods. (Author's Note: I suggest using the callgrind converter and visualizing the results with kcachegrind). + + + +### Firewall Frequently Changing Header Files + + + +#### Don't Unnecessarily Include Headers + +The compiler has to do something with each include directive it sees. Even if it stops as soon as it sees the `#ifndef` include guard, it still had to open the file and begin processing it. + +[include-what-you-use](https://github.com/include-what-you-use/include-what-you-use) is a tool that can help you identify which headers you need. + +#### Reduce the load on the preprocessor + +This is a general form of "Firewall Frequently Changing Header Files" and "Don't Unnecessarily Include Headers." Tools like BOOST_PP can be very helpful, but they also put a huge burden on the preprocessor. + +### Consider using precompiled headers + +The usage of precompiled headers can considerably reduce the compile time in large projects. Selected headers are compiled to an intermediate form (PCH files) that can be faster processed by the compiler. It is recommended to define only frequently used header that changes rarely as precompiled header (e.g. system and library headers) to achieve the compile time reduction. +But you have to keep in mind, that using precompiled headers has several disadvantages: +* The usage of precompiled header is not portable. +* The generated PCH files are machine dependent. +* The generated PCH files can be quite large. +* It can break your header dependencies. Because of the precompiled headers, every file has the possibility to include every header that is marked as a precompiled header. In result it can happen, that the build fails if you disable the precompiled headers. 
This can be an issue if you ship something like a library. Because of this, it is highly recommended to build once with precompiled headers enabled and a second time without them. + +Precompiled headers are supported by the most common compilers, like [GCC](https://gcc.gnu.org/onlinedocs/gcc/Precompiled-Headers.html), [Clang](http://clang.llvm.org/docs/PCHInternals.html) and [Visual Studio](https://msdn.microsoft.com/en-us/library/szfdksca.aspx). +Tools like [cotire](https://github.com/sakra/cotire/) (a plugin for cmake) can help you to add precompiled headers to your build system. + +### Consider Using Tools + +These are not meant to supersede good design + + * [ccache](https://ccache.samba.org/), compile results caching for unix-like operating systems + * [clcache](https://github.com/frerich/clcache), compile results caching for cl.exe (MSVC) + * [warp](https://github.com/facebook/warp), Facebook's preprocessor + +### Put tmp on Ramdisk + +See [this](https://www.youtube.com/watch?v=t4M3yG1dWho) YouTube video for more details. + +### Use the gold linker + +If on Linux, consider using the gold linker for GCC. + +## Runtime + +### Analyze the Code! + +There's no real way to know where your bottlenecks are without analyzing the code. + + * http://developer.amd.com/tools-and-sdks/opencl-zone/codexl/ + * http://www.codersnotes.com/sleepy + +### Simplify the Code + +The cleaner, simpler, and easier to read the code is, the better chance the compiler has at implementing it well. + +### Use Initializer Lists + +```cpp +// This +std::vector<ModelObject> mos{mo1, mo2}; + +// -or- +auto mos = std::vector<ModelObject>{mo1, mo2}; +``` + +```cpp +// Don't do this +std::vector<ModelObject> mos; +mos.push_back(mo1); +mos.push_back(mo2); +``` + +Initializer lists are significantly more efficient, reducing object copies and resizing of containers.
+ +### Reduce Temporary Objects + +```cpp +// Instead of +auto mo1 = getSomeModelObject(); +auto mo2 = getAnotherModelObject(); + +doSomething(mo1, mo2); +``` + +```cpp +// consider: + +doSomething(getSomeModelObject(), getAnotherModelObject()); +``` + +The first form, which stores the results in named variables, prevents the compiler from performing a move operation. + +### Enable move operations + +Move operations are one of the most touted features of C++11. They allow the compiler to avoid extra copies by moving temporary objects instead of copying them in certain cases. + +Certain coding choices we make (such as declaring our own destructor or assignment operator or copy constructor) prevent the compiler from generating a move constructor. + +For most code, a simple + +```cpp +ModelObject(ModelObject &&) = default; +``` + +would suffice. However, MSVC2013 doesn't seem to like this code yet. + +### Kill `shared_ptr` Copies + +`shared_ptr` objects are much more expensive to copy than you'd think they would be. This is because the reference count must be atomic and thread-safe. So this comment just reinforces the note above: avoid temporaries and too many copies of objects. Just because we are using a pImpl does not mean our copies are free. + +### Reduce Copies and Reassignments as Much as Possible + +For simpler cases, the ternary operator can be used: + +```cpp +// Bad Idea +std::string somevalue; + +if (caseA) { + somevalue = "Value A"; +} else { + somevalue = "Value B"; +} +``` + +```cpp +// Better Idea +const std::string somevalue = caseA ? "Value A" : "Value B"; +``` + +More complex cases can be facilitated with an [immediately-invoked lambda](http://blog2.emptycrate.com/content/complex-object-initialization-optimization-iife-c11).
+ +```cpp +// Bad Idea +std::string somevalue; + +if (caseA) { + somevalue = "Value A"; +} else if(caseB) { + somevalue = "Value B"; +} else { + somevalue = "Value C"; +} +``` + +```cpp +// Better Idea +const std::string somevalue = [&](){ + if (caseA) { + return "Value A"; + } else if (caseB) { + return "Value B"; + } else { + return "Value C"; + } + }(); +``` + + +### Avoid Excess Exceptions + +Exceptions which are thrown and captured internally during normal processing slow down the application execution. They also destroy the user experience from within a debugger, as debuggers monitor and report on each exception event. It is best to just avoid internal exception processing when possible. + +### Get rid of “new” + +We already know that we should not be using raw memory access, so we are using `unique_ptr` and `shared_ptr` instead, right? +Heap allocations are much more expensive than stack allocations, but sometimes we have to use them. To make matters worse, creating a `shared_ptr` actually requires 2 heap allocations. + +However, the `make_shared` function reduces this down to just one. + +```cpp +std::shared_ptr<ModelObject_Impl>(new ModelObject_Impl()); + +// should become +std::make_shared<ModelObject_Impl>(); // (it's also more readable and concise) +``` + +### Prefer `unique_ptr` to `shared_ptr` + +If possible use `unique_ptr` instead of `shared_ptr`. The `unique_ptr` does not need to keep track of its copies because it is not copyable. Because of this it is more efficient than the `shared_ptr`. Equivalent to `shared_ptr` and `make_shared` you should use `make_unique` (C++14 or greater) to create the `unique_ptr`: + +```cpp +std::make_unique<ModelObject_Impl>(); +``` + +Current best practices suggest returning a `unique_ptr` from factory functions as well, then converting the `unique_ptr` to a `shared_ptr` if necessary. 
+ +```cpp +std::unique_ptr<ModelObject_Impl> factory(); + +auto shared = std::shared_ptr<ModelObject_Impl>(factory()); +``` + +### Get rid of std::endl + +`std::endl` implies a flush operation. It's equivalent to `"\n" << std::flush`. + + +### Limit Variable Scope + +Variables should be declared as late as possible, and ideally only when it's possible to initialize the object. Reduced variable scope results in less memory being used, more efficient code in general, and helps the compiler optimize the code further. + +```cpp +// Good Idea +for (int i = 0; i < 15; ++i) +{ + MyObject obj(i); + // do something with obj +} + +// Bad Idea +MyObject obj; // meaningless object initialization +for (int i = 0; i < 15; ++i) +{ + obj = MyObject(i); // unnecessary assignment operation + // do something with obj +} +// obj is still taking up memory for no reason +``` + +For C++17 and onwards, consider using init-statement in the `if` and `switch` statements: + +```cpp +if (MyObject obj(index); obj.good()) { + // do something if obj is good +} else { + // do something if obj is not good +} +``` + +[This topic has an associated discussion thread](https://github.com/lefticus/cppbestpractices/issues/52). + +### Prefer `double` to `float`, But Test First + +Depending on the situation and the compiler's ability to optimize, one may be faster over the other. Choosing `float` will result in lower precision and may be slower due to conversions. On vectorizable operations `float` may be faster if you are able to sacrifice precision. + +`double` is the recommended default choice as it is the default type for floating point values in C++. + +See this [stackoverflow](http://stackoverflow.com/questions/4584637/double-or-float-which-is-faster) discussion for some more information. + +### Prefer `++i` to `i++` +... when it is semantically correct. 
Pre-increment is [faster](http://blog2.emptycrate.com/content/why-i-faster-i-c) than post-increment because it does not require a copy of the object to be made. + +```cpp +// Bad Idea +for (int i = 0; i < 15; i++) +{ + std::cout << i << '\n'; +} + +// Good Idea +for (int i = 0; i < 15; ++i) +{ + std::cout << i << '\n'; +} +``` + +Even if many modern compilers will optimize these two loops to the same assembly code, it is still good practice to prefer `++i`. There is absolutely no reason not to and you can never be certain that your code will not pass through a compiler that does not optimize this. +You should also be aware that the compiler will be able to optimize this only for integer types, and not necessarily for all iterator or other user-defined types. +The bottom line is that it is always easier and recommended to use the pre-increment operator if it is semantically identical to the post-increment operator. + +### Char is a char, string is a string + +```cpp +// Bad Idea +std::cout << someThing() << "\n"; + +// Good Idea +std::cout << someThing() << '\n'; +``` + +This is very minor, but a `"\n"` has to be parsed by the compiler as a `const char *`, which requires a scan for the terminating `\0` when writing it to the stream (or appending to a string). A `'\n'` is known to be a single character and avoids many CPU instructions. + +If used inefficiently very many times it might have an impact on your performance, but more importantly thinking about these two usage cases gets you thinking more about what the compiler and runtime have to do to execute your code. + + +### Never Use `std::bind` + +`std::bind` is almost always way more overhead (both compile time and runtime) than you need. Instead simply use a lambda. 
+ +```cpp +// Bad Idea +auto f = std::bind(&my_function, "hello", std::placeholders::_1); +f("world"); + +// Good Idea +auto f = [](const std::string &s) { return my_function("hello", s); }; +f("world"); +``` + + +### Know The Standard Library + +Properly use the already highly optimized components of the vendor provided standard library. + +#### `in_place_t` And Related + +Be aware of how to use `in_place_t` and related tags for efficient creation of objects such as `std::tuple`, `std::any` and `std::variant`. + diff --git a/docs/cpp-coding/09-Considering_Correctness.md b/docs/cpp-coding/09-Considering_Correctness.md new file mode 100644 index 000000000..5bc8b61ec --- /dev/null +++ b/docs/cpp-coding/09-Considering_Correctness.md @@ -0,0 +1,30 @@ +# Considering Correctness + +## Avoid Typeless Interfaces + + +Bad Idea: + +```cpp +std::string find_file(const std::string &base, const std::string &pattern); +``` + +Better Idea: + +```cpp +std::filesystem::path find_file(const std::filesystem::path &base, const std::regex &pattern); +``` + +The above is better but still suffers from having implicit conversions from `std::string` to `std::filesystem::path` and back. + +Consider using a typesafe library like + + * https://foonathan.net/type_safe/ + * https://github.com/rollbear/strong_type + * https://github.com/joboccara/NamedType + +Note that stronger typing can also allow for more compiler optimizations. + +* [Sorting in C vs C++](Sorting in C vs C++.pdf) + + diff --git a/docs/cpp-coding/10-Enable_Scripting.md b/docs/cpp-coding/10-Enable_Scripting.md new file mode 100644 index 000000000..e22724b3e --- /dev/null +++ b/docs/cpp-coding/10-Enable_Scripting.md @@ -0,0 +1,12 @@ +# Enable Scripting + +The combination of scripting and compiled languages is very powerful. 
It gives us the things we've come to love about compiled languages: type safety, performance, thread safety options, consistent memory model while also giving us the flexibility to try something new quickly without a full rebuild. + +The VM based compiled languages have learned this already: JRuby, Jython, IronRuby, IronPython + + * [ChaiScript](http://chaiscript.com/) + * [AngelScript](http://www.angelcode.com/angelscript/) + * [luabind](http://www.rasterbar.com/products/luabind.html) + * [sol2](https://github.com/ThePhD/sol2) (bindings for Lua) + * [SWIG](http://www.swig.org/) (simplified wrapper and interface generator) + * [pybind11](https://pybind11.readthedocs.io/en/stable/) (Python and modern C++ interoperability) diff --git a/docs/cpp-coding/11-Further_Reading.md b/docs/cpp-coding/11-Further_Reading.md new file mode 100644 index 000000000..515e16fc2 --- /dev/null +++ b/docs/cpp-coding/11-Further_Reading.md @@ -0,0 +1,29 @@ +# Further Reading + +*Note: This book has now inspired a video series from O'Reilly, [Learning C++ Best Practices](http://shop.oreilly.com/product/0636920049814.do)* + +## C++ + + * https://github.com/isocpp/CppCoreGuidelines The C++ Core Guidelines are a set of tried-and-true guidelines, rules, and best practices about coding in C++ + * https://www.gitbook.com/book/alexastva/the-ultimate-question-of-programming-refactoring-/details - The Ultimate Question of Programming, Refactoring, and Everything + * http://llvm.org/docs/CodingStandards.html - LLVM Coding Standards - very well written + * http://geosoft.no/development/cppstyle.html + * https://google.github.io/styleguide/cppguide.html (Note that Google's standard document makes several recommendations which we will NOT be following. For example, they explicitly forbid the use of exceptions, which makes [RAII](http://blog2.emptycrate.com/content/nobody-understands-c-part-2-raii) impossible.) 
+ * https://isocpp.org/faq/ + * http://www.cplusplus.com/ + * http://www.gamasutra.com/view/news/128836/InDepth_Static_Code_Analysis.php - Article from John Carmack on the advantages of static analysis + * https://svn.boost.org/trac/boost/wiki/BestPracticeHandbook - Best Practice Handbook from Nial Douglas + * http://sourceforge.net/apps/mediawiki/cppcheck/index.php?title=ListOfChecks + * http://emptycrate.com/ + * http://stackoverflow.com/questions/tagged/c%2b%2b-faq?sort=votes&pageSize=15 - StackOverflow C++ FAQ + * http://codergears.com/qacenter/ discussion center for C and C++ best practices + * http://www.viva64.com/en/b/0391/ The Ultimate Question of Programming, Refactoring, and Everything + +## CMake + + * https://cmake.org/cmake/help/latest/manual/cmake.1.html - Be aware that there are `--warn` command line options for CMake that can catch some issues. + * https://github.com/Akagi201/learning-cmake + * https://codingnest.com/basic-cmake/ + * https://gist.github.com/mbinna/c61dbb39bca0e4fb7d1f73b0d66a4fd1 - Effective CMake online book + * https://pabloariasal.github.io/2018/02/19/its-time-to-do-cmake-right/ + * https://cliutils.gitlab.io/modern-cmake/ - An Introduction to Modern CMake diff --git a/docs/cpp-coding/12-Final_Thoughts.md b/docs/cpp-coding/12-Final_Thoughts.md new file mode 100644 index 000000000..e7f711bf3 --- /dev/null +++ b/docs/cpp-coding/12-Final_Thoughts.md @@ -0,0 +1,4 @@ +# Final Thoughts + +Expand your horizons and use other programming languages. Other languages have different constructs and expressions. Learning what else is out there will encourage you to be more creative with your C++ and write cleaner, more expressive code. 
+ diff --git a/docs/cpp-coding/SUMMARY.md b/docs/cpp-coding/SUMMARY.md new file mode 100644 index 000000000..6ecc4de54 --- /dev/null +++ b/docs/cpp-coding/SUMMARY.md @@ -0,0 +1,15 @@ +# Summary + +* [Preface](01-Preface.md) +* [Use the Tools Available](02-Use_the_Tools_Available.md) +* [Style](03-Style.md) +* [Considering Safety](04-Considering_Safety.md) +* [Considering Maintainability](05-Considering_Maintainability.md) +* [Considering Portability](06-Considering_Portability.md) +* [Considering Threadability](07-Considering_Threadability.md) +* [Considering Performance](08-Considering_Performance.md) +* [Considering Correctness](09-Considering_Correctness.md) +* [Enable Scripting](10-Enable_Scripting.md) +* [Further Reading](11-Further_Reading.md) +* [Final Thoughts](12-Final_Thoughts.md) + diff --git a/prepare_commit.bat b/prepare_commit.bat new file mode 100644 index 000000000..97bcea6ed --- /dev/null +++ b/prepare_commit.bat @@ -0,0 +1 @@ +python %~dp0\\scripts\formatcode.py diff --git a/scripts/formatcode.py b/scripts/formatcode.py new file mode 100644 index 000000000..1a214380d --- /dev/null +++ b/scripts/formatcode.py @@ -0,0 +1,49 @@ +import os +import fileinput + +def is_header_missing(f): + with open(f) as reader: + lines = reader.read().lstrip().splitlines() + if len(lines) > 0: return not lines[0].startswith("// ") + return True + +def add_headers(files, header): + for line in fileinput.input(files, inplace=True): + if fileinput.isfirstline(): + [ print(h) for h in header.splitlines() ] + print(line, end="") + +def scan_tree(root): + files = [] + header_files = [] + with os.scandir(root) as dirs: + for entry in dirs: + if entry.is_dir(): + scan_tree(os.path.join(root, entry.name)) + elif entry.name.endswith(".cpp") or entry.name.endswith(".h"): + print("... 
formatting: %s"%(entry.name)) + full_path = os.path.join(root, entry.name) + files.append(full_path) + if is_header_missing(full_path): + header_files.append(full_path) + args = "" + if files: + os.system("clang-format -i " + " ".join(files)) + if header_files: + add_headers(header_files, "// Copyright Epic Games, Inc. All Rights Reserved.\n\n") + +def scan_zen(root): + with os.scandir(root) as dirs: + for entry in dirs: + if entry.is_dir() and entry.name.startswith("zen"): + scan_tree(os.path.join(root, entry.name)) + +while True: + if (os.path.isfile(".clang-format")): + scan_zen(".") + quit() + else: + cwd = os.getcwd() + if os.path.dirname(cwd) == cwd: + quit() + os.chdir("..") diff --git a/scripts/installdeps.bat b/scripts/installdeps.bat new file mode 100644 index 000000000..dab9c2c67 --- /dev/null +++ b/scripts/installdeps.bat @@ -0,0 +1 @@ +vcpkg --x-manifest-root=%~dp0.. --triplet=x64-windows-static install diff --git a/scripts/testsetup.bat b/scripts/testsetup.bat new file mode 100644 index 000000000..3adabca29 --- /dev/null +++ b/scripts/testsetup.bat @@ -0,0 +1,18 @@ +@echo off + +rem This script can be used to add the build output folder (Release) to the path for easier +rem ad hoc testing from the command line + +CALL :NORMALIZEPATH "%~dp0..\x64\Release" + +SET _BINPATH=%RETVAL% + +path|find /i "%_BINPATH%" >nul || set PATH=%PATH%;%_BINPATH% + +SET _BINPATH= + +EXIT /B + +:NORMALIZEPATH + SET RETVAL=%~f1 + EXIT /B diff --git a/setup_demo.bat b/setup_demo.bat new file mode 100644 index 000000000..7157e8ceb --- /dev/null +++ b/setup_demo.bat @@ -0,0 +1,10 @@ +@echo off +rem Add path so commands can be easily executed + +path %PATH%;%~dp0%x64\Release + +echo Path to x64 release executables added +echo . +echo use zenfs commands to create and manage snapshots +echo . 
+echo zenfssvc runs the zenfs service diff --git a/test-data/test1/A.txt b/test-data/test1/A.txt new file mode 100644 index 000000000..8c7e5a667 --- /dev/null +++ b/test-data/test1/A.txt @@ -0,0 +1 @@ +A
\ No newline at end of file diff --git a/test-data/test1/ABC.txt b/test-data/test1/ABC.txt new file mode 100644 index 000000000..48b83b862 --- /dev/null +++ b/test-data/test1/ABC.txt @@ -0,0 +1 @@ +ABC
\ No newline at end of file diff --git a/test-data/test1/B.txt b/test-data/test1/B.txt new file mode 100644 index 000000000..7371f47a6 --- /dev/null +++ b/test-data/test1/B.txt @@ -0,0 +1 @@ +B
\ No newline at end of file diff --git a/test-data/test1/C.txt b/test-data/test1/C.txt new file mode 100644 index 000000000..96d80cd6c --- /dev/null +++ b/test-data/test1/C.txt @@ -0,0 +1 @@ +C
\ No newline at end of file diff --git a/test-data/test1/dir/D.txt b/test-data/test1/dir/D.txt new file mode 100644 index 000000000..02358d235 --- /dev/null +++ b/test-data/test1/dir/D.txt @@ -0,0 +1 @@ +D
\ No newline at end of file diff --git a/test-data/test1/dir/E.txt b/test-data/test1/dir/E.txt new file mode 100644 index 000000000..9fb75b8d4 --- /dev/null +++ b/test-data/test1/dir/E.txt @@ -0,0 +1 @@ +E
\ No newline at end of file diff --git a/test-data/test1/dir/F.txt b/test-data/test1/dir/F.txt new file mode 100644 index 000000000..c137216fe --- /dev/null +++ b/test-data/test1/dir/F.txt @@ -0,0 +1 @@ +F
\ No newline at end of file diff --git a/test/minio.bat b/test/minio.bat new file mode 100644 index 000000000..1ca1fb591 --- /dev/null +++ b/test/minio.bat @@ -0,0 +1,3 @@ +set MINIO_ROOT_USER=admin +set MINIO_ROOT_PASSWORD=password +%~dp0\bin\minio.exe server %~dp0\..\.minio_data diff --git a/test/remote1/zentest-appstub.exe b/test/remote1/zentest-appstub.exe Binary files differnew file mode 100644 index 000000000..5fa38090f --- /dev/null +++ b/test/remote1/zentest-appstub.exe diff --git a/vcpkg.json b/vcpkg.json new file mode 100644 index 000000000..573e5b4bc --- /dev/null +++ b/vcpkg.json @@ -0,0 +1,31 @@ +{ + "name": "zenfs", + "version-string": "0.1.0-dev", + "dependencies": [ + "asio", + "curl", + "cpr", + "openssl", + "json11", + "mimalloc", + "spdlog", + "lz4", + "zstd", + "xxhash", + "gsl-lite", + "restinio", + "cxxopts", + "doctest", + "modp-base64", + "ryml", + "http-parser", + "lmdb", + "robin-map", + { + "name": "rocksdb", + "features": [ "lz4", "zstd" ] + }, + "sol2", + "breakpad" + ] +} diff --git a/vs-chromium-project.txt b/vs-chromium-project.txt new file mode 100644 index 000000000..2b2e15bc7 --- /dev/null +++ b/vs-chromium-project.txt @@ -0,0 +1,8 @@ +[SearchableFiles.include] +* + +[SourceExplorer.ignore] +.git/ +.x64/ +*.suo +**/.x64/ diff --git a/zen.sln b/zen.sln new file mode 100644 index 000000000..b581424bd --- /dev/null +++ b/zen.sln @@ -0,0 +1,93 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.28315.86 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "docs", "docs", "{4EA55E5B-18A1-4E66-B821-44575BC11EA7}" + ProjectSection(SolutionItems) = preProject + .gitignore = .gitignore + .gitmodules = .gitmodules + CODING.md = CODING.md + README.md = README.md + RESTAPI.md = RESTAPI.md + vcpkg.json = vcpkg.json + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zencore", 
"zencore\zencore.vcxproj", "{D75BF9AB-C61E-4FFF-AD59-1563430F05E2}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zencore-test", "zencore-test\zencore-test.vcxproj", "{C00173DF-B76E-4989-B576-FE2B780B2580}" + ProjectSection(ProjectDependencies) = postProject + {D75BF9AB-C61E-4FFF-AD59-1563430F05E2} = {D75BF9AB-C61E-4FFF-AD59-1563430F05E2} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zenserver", "zenserver\zenserver.vcxproj", "{8398D81C-B1B6-4327-82B1-06EACB8A144F}" + ProjectSection(ProjectDependencies) = postProject + {D75BF9AB-C61E-4FFF-AD59-1563430F05E2} = {D75BF9AB-C61E-4FFF-AD59-1563430F05E2} + {26CBBAEB-14C1-4EFC-877D-80F48215651C} = {26CBBAEB-14C1-4EFC-877D-80F48215651C} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zen", "zen\zen.vcxproj", "{CA7B9E04-A2D3-4A39-A7D7-FB156A2C6A48}" + ProjectSection(ProjectDependencies) = postProject + {8398D81C-B1B6-4327-82B1-06EACB8A144F} = {8398D81C-B1B6-4327-82B1-06EACB8A144F} + {D75BF9AB-C61E-4FFF-AD59-1563430F05E2} = {D75BF9AB-C61E-4FFF-AD59-1563430F05E2} + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "3rdparty", "3rdparty", "{3CB3B9E8-B4CB-4D2E-821A-2AFE34093BEF}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zenserver-test", "zenserver-test\zenserver-test.vcxproj", "{2563249E-E695-4CC4-8FFA-335D07680C9D}" + ProjectSection(ProjectDependencies) = postProject + {8398D81C-B1B6-4327-82B1-06EACB8A144F} = {8398D81C-B1B6-4327-82B1-06EACB8A144F} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zenstore", "zenstore\zenstore.vcxproj", "{26CBBAEB-14C1-4EFC-877D-80F48215651C}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zentestutil", "zentestutil\zentestutil.vcxproj", "{77F8315D-B21D-4DB0-9A6F-2D3359F88A70}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zentest-appstub", 
"zentest-appstub\zentest-appstub.vcxproj", "{7FFC7E77-D038-44E9-8D84-41918C355F29}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D75BF9AB-C61E-4FFF-AD59-1563430F05E2}.Debug|x64.ActiveCfg = Debug|x64 + {D75BF9AB-C61E-4FFF-AD59-1563430F05E2}.Debug|x64.Build.0 = Debug|x64 + {D75BF9AB-C61E-4FFF-AD59-1563430F05E2}.Release|x64.ActiveCfg = Release|x64 + {D75BF9AB-C61E-4FFF-AD59-1563430F05E2}.Release|x64.Build.0 = Release|x64 + {C00173DF-B76E-4989-B576-FE2B780B2580}.Debug|x64.ActiveCfg = Debug|x64 + {C00173DF-B76E-4989-B576-FE2B780B2580}.Debug|x64.Build.0 = Debug|x64 + {C00173DF-B76E-4989-B576-FE2B780B2580}.Release|x64.ActiveCfg = Release|x64 + {C00173DF-B76E-4989-B576-FE2B780B2580}.Release|x64.Build.0 = Release|x64 + {8398D81C-B1B6-4327-82B1-06EACB8A144F}.Debug|x64.ActiveCfg = Debug|x64 + {8398D81C-B1B6-4327-82B1-06EACB8A144F}.Debug|x64.Build.0 = Debug|x64 + {8398D81C-B1B6-4327-82B1-06EACB8A144F}.Release|x64.ActiveCfg = Release|x64 + {8398D81C-B1B6-4327-82B1-06EACB8A144F}.Release|x64.Build.0 = Release|x64 + {CA7B9E04-A2D3-4A39-A7D7-FB156A2C6A48}.Debug|x64.ActiveCfg = Debug|x64 + {CA7B9E04-A2D3-4A39-A7D7-FB156A2C6A48}.Debug|x64.Build.0 = Debug|x64 + {CA7B9E04-A2D3-4A39-A7D7-FB156A2C6A48}.Release|x64.ActiveCfg = Release|x64 + {CA7B9E04-A2D3-4A39-A7D7-FB156A2C6A48}.Release|x64.Build.0 = Release|x64 + {2563249E-E695-4CC4-8FFA-335D07680C9D}.Debug|x64.ActiveCfg = Debug|x64 + {2563249E-E695-4CC4-8FFA-335D07680C9D}.Debug|x64.Build.0 = Debug|x64 + {2563249E-E695-4CC4-8FFA-335D07680C9D}.Release|x64.ActiveCfg = Release|x64 + {2563249E-E695-4CC4-8FFA-335D07680C9D}.Release|x64.Build.0 = Release|x64 + {26CBBAEB-14C1-4EFC-877D-80F48215651C}.Debug|x64.ActiveCfg = Debug|x64 + {26CBBAEB-14C1-4EFC-877D-80F48215651C}.Debug|x64.Build.0 = Debug|x64 + {26CBBAEB-14C1-4EFC-877D-80F48215651C}.Release|x64.ActiveCfg = Release|x64 
+ {26CBBAEB-14C1-4EFC-877D-80F48215651C}.Release|x64.Build.0 = Release|x64 + {77F8315D-B21D-4DB0-9A6F-2D3359F88A70}.Debug|x64.ActiveCfg = Debug|x64 + {77F8315D-B21D-4DB0-9A6F-2D3359F88A70}.Debug|x64.Build.0 = Debug|x64 + {77F8315D-B21D-4DB0-9A6F-2D3359F88A70}.Release|x64.ActiveCfg = Release|x64 + {77F8315D-B21D-4DB0-9A6F-2D3359F88A70}.Release|x64.Build.0 = Release|x64 + {7FFC7E77-D038-44E9-8D84-41918C355F29}.Debug|x64.ActiveCfg = Debug|x64 + {7FFC7E77-D038-44E9-8D84-41918C355F29}.Debug|x64.Build.0 = Debug|x64 + {7FFC7E77-D038-44E9-8D84-41918C355F29}.Release|x64.ActiveCfg = Release|x64 + {7FFC7E77-D038-44E9-8D84-41918C355F29}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {6E9AFFA0-9B14-4264-B7D1-B61F395A0B5F} + EndGlobalSection +EndGlobal diff --git a/zen/chunk/chunk.cpp b/zen/chunk/chunk.cpp new file mode 100644 index 000000000..0a7efd56a --- /dev/null +++ b/zen/chunk/chunk.cpp @@ -0,0 +1,1157 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "chunk.h" +#include <doctest/doctest.h> + +#include <gsl/gsl-lite.hpp> + +#include <zencore/filesystem.h> +#include <zencore/iohash.h> +#include <zencore/refcount.h> +#include <zencore/scopeguard.h> +#include <zencore/sha1.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/timer.h> +#include <zenstore/cas.h> + +#include "../internalfile.h" + +#include <lz4.h> +#include <spdlog/spdlog.h> +#include <zstd.h> + +#include <ppl.h> +#include <ppltasks.h> + +#include <cmath> +#include <filesystem> +#include <random> +#include <vector> + +////////////////////////////////////////////////////////////////////////// + +namespace detail { +static const uint32_t buzhashTable[] = { + 0x458be752, 0xc10748cc, 0xfbbcdbb8, 0x6ded5b68, 0xb10a82b5, 0x20d75648, 0xdfc5665f, 0xa8428801, 0x7ebf5191, 0x841135c7, 0x65cc53b3, + 0x280a597c, 0x16f60255, 0xc78cbc3e, 0x294415f5, 0xb938d494, 0xec85c4e6, 0xb7d33edc, 0xe549b544, 0xfdeda5aa, 0x882bf287, 0x3116737c, + 0x05569956, 0xe8cc1f68, 0x0806ac5e, 0x22a14443, 0x15297e10, 0x50d090e7, 0x4ba60f6f, 0xefd9f1a7, 0x5c5c885c, 0x82482f93, 0x9bfd7c64, + 0x0b3e7276, 0xf2688e77, 0x8fad8abc, 0xb0509568, 0xf1ada29f, 0xa53efdfe, 0xcb2b1d00, 0xf2a9e986, 0x6463432b, 0x95094051, 0x5a223ad2, + 0x9be8401b, 0x61e579cb, 0x1a556a14, 0x5840fdc2, 0x9261ddf6, 0xcde002bb, 0x52432bb0, 0xbf17373e, 0x7b7c222f, 0x2955ed16, 0x9f10ca59, + 0xe840c4c9, 0xccabd806, 0x14543f34, 0x1462417a, 0x0d4a1f9c, 0x087ed925, 0xd7f8f24c, 0x7338c425, 0xcf86c8f5, 0xb19165cd, 0x9891c393, + 0x325384ac, 0x0308459d, 0x86141d7e, 0xc922116a, 0xe2ffa6b6, 0x53f52aed, 0x2cd86197, 0xf5b9f498, 0xbf319c8f, 0xe0411fae, 0x977eb18c, + 0xd8770976, 0x9833466a, 0xc674df7f, 0x8c297d45, 0x8ca48d26, 0xc49ed8e2, 0x7344f874, 0x556f79c7, 0x6b25eaed, 0xa03e2b42, 0xf68f66a4, + 0x8e8b09a2, 0xf2e0e62a, 0x0d3a9806, 0x9729e493, 0x8c72b0fc, 0x160b94f6, 0x450e4d3d, 0x7a320e85, 0xbef8f0e1, 0x21d73653, 0x4e3d977a, + 0x1e7b3929, 0x1cc6c719, 0xbe478d53, 0x8d752809, 0xe6d8c2c6, 0x275f0892, 
0xc8acc273, 0x4cc21580, 0xecc4a617, 0xf5f7be70, 0xe795248a, + 0x375a2fe9, 0x425570b6, 0x8898dcf8, 0xdc2d97c4, 0x0106114b, 0x364dc22f, 0x1e0cad1f, 0xbe63803c, 0x5f69fac2, 0x4d5afa6f, 0x1bc0dfb5, + 0xfb273589, 0x0ea47f7b, 0x3c1c2b50, 0x21b2a932, 0x6b1223fd, 0x2fe706a8, 0xf9bd6ce2, 0xa268e64e, 0xe987f486, 0x3eacf563, 0x1ca2018c, + 0x65e18228, 0x2207360a, 0x57cf1715, 0x34c37d2b, 0x1f8f3cde, 0x93b657cf, 0x31a019fd, 0xe69eb729, 0x8bca7b9b, 0x4c9d5bed, 0x277ebeaf, + 0xe0d8f8ae, 0xd150821c, 0x31381871, 0xafc3f1b0, 0x927db328, 0xe95effac, 0x305a47bd, 0x426ba35b, 0x1233af3f, 0x686a5b83, 0x50e072e5, + 0xd9d3bb2a, 0x8befc475, 0x487f0de6, 0xc88dff89, 0xbd664d5e, 0x971b5d18, 0x63b14847, 0xd7d3c1ce, 0x7f583cf3, 0x72cbcb09, 0xc0d0a81c, + 0x7fa3429b, 0xe9158a1b, 0x225ea19a, 0xd8ca9ea3, 0xc763b282, 0xbb0c6341, 0x020b8293, 0xd4cd299d, 0x58cfa7f8, 0x91b4ee53, 0x37e4d140, + 0x95ec764c, 0x30f76b06, 0x5ee68d24, 0x679c8661, 0xa41979c2, 0xf2b61284, 0x4fac1475, 0x0adb49f9, 0x19727a23, 0x15a7e374, 0xc43a18d5, + 0x3fb1aa73, 0x342fc615, 0x924c0793, 0xbee2d7f0, 0x8a279de9, 0x4aa2d70c, 0xe24dd37f, 0xbe862c0b, 0x177c22c2, 0x5388e5ee, 0xcd8a7510, + 0xf901b4fd, 0xdbc13dbc, 0x6c0bae5b, 0x64efe8c7, 0x48b02079, 0x80331a49, 0xca3d8ae6, 0xf3546190, 0xfed7108b, 0xc49b941b, 0x32baf4a9, + 0xeb833a4a, 0x88a3f1a5, 0x3a91ce0a, 0x3cc27da1, 0x7112e684, 0x4a3096b1, 0x3794574c, 0xa3c8b6f3, 0x1d213941, 0x6e0a2e00, 0x233479f1, + 0x0f4cd82f, 0x6093edd2, 0x5d7d209e, 0x464fe319, 0xd4dcac9e, 0x0db845cb, 0xfb5e4bc3, 0xe0256ce1, 0x09fb4ed1, 0x0914be1e, 0xa5bdb2c3, + 0xc6eb57bb, 0x30320350, 0x3f397e91, 0xa67791bc, 0x86bc0e2c, 0xefa0a7e2, 0xe9ff7543, 0xe733612c, 0xd185897b, 0x329e5388, 0x91dd236b, + 0x2ecb0d93, 0xf4d82a3d, 0x35b5c03f, 0xe4e606f0, 0x05b21843, 0x37b45964, 0x5eff22f4, 0x6027f4cc, 0x77178b3c, 0xae507131, 0x7bf7cabc, + 0xf9c18d66, 0x593ade65, 0xd95ddf11, +}; + +// ROL operation (compiler turns this into a ROL when optimizing) +static inline uint32_t +Rotate32(uint32_t Value, size_t RotateCount) +{ + RotateCount 
&= 31; + + return ((Value) << (RotateCount)) | ((Value) >> (32 - RotateCount)); +} +} // namespace detail + +////////////////////////////////////////////////////////////////////////// + +class ZenChunker +{ +public: + void SetChunkSize(size_t MinSize, size_t MaxSize, size_t AvgSize); + size_t ScanChunk(const void* DataBytes, size_t ByteCount); + void Reset(); + + // This controls which chunking approach is used - threshold or + // modulo based. Threshold is faster and generates similarly sized + // chunks + void SetUseThreshold(bool NewState) { m_useThreshold = NewState; } + + inline size_t ChunkSizeMin() const { return m_chunkSizeMin; } + inline size_t ChunkSizeMax() const { return m_chunkSizeMax; } + inline size_t ChunkSizeAvg() const { return m_chunkSizeAvg; } + inline uint64_t BytesScanned() const { return m_bytesScanned; } + + static constexpr size_t NoBoundaryFound = size_t(~0ull); + +private: + size_t m_chunkSizeMin = 0; + size_t m_chunkSizeMax = 0; + size_t m_chunkSizeAvg = 0; + + uint32_t m_discriminator = 0; // Computed in SetChunkSize() + uint32_t m_threshold = 0; // Computed in SetChunkSize() + + bool m_useThreshold = true; + + static constexpr size_t kChunkSizeLimitMax = 64 * 1024 * 1024; + static constexpr size_t kChunkSizeLimitMin = 1024; + + static constexpr size_t kDefaultAverageChunkSize = 64 * 1024; + + static constexpr int kWindowSize = 48; + uint8_t m_window[kWindowSize]; + uint32_t m_windowSize = 0; + + uint32_t m_currentHash = 0; + uint32_t m_currentChunkSize = 0; + + uint64_t m_bytesScanned = 0; + + size_t InternalScanChunk(const void* DataBytes, size_t ByteCount); + void InternalReset(); +}; + +void +ZenChunker::Reset() +{ + InternalReset(); + + m_bytesScanned = 0; +} + +void +ZenChunker::InternalReset() +{ + m_currentHash = 0; + m_currentChunkSize = 0; + m_windowSize = 0; +} + +void +ZenChunker::SetChunkSize(size_t MinSize, size_t MaxSize, size_t AvgSize) +{ + if (m_windowSize) + return; // Already started + + 
static_assert(kChunkSizeLimitMin > kWindowSize); + + if (AvgSize) + { + // TODO: Validate AvgSize range + } + else + { + if (MinSize && MaxSize) + { + AvgSize = lrint(pow(2, (log2(MinSize) + log2(MaxSize)) / 2)); + } + else if (MinSize) + { + AvgSize = MinSize * 4; + } + else if (MaxSize) + { + AvgSize = MaxSize / 4; + } + else + { + AvgSize = kDefaultAverageChunkSize; + } + } + + if (MinSize) + { + // TODO: Validate MinSize range + } + else + { + MinSize = std::max(AvgSize / 4, kChunkSizeLimitMin); + } + + if (MaxSize) + { + // TODO: Validate MaxSize range + } + else + { + MaxSize = std::min(AvgSize * 4, kChunkSizeLimitMax); + } + + m_discriminator = gsl::narrow<uint32_t>(AvgSize - MinSize); + + if (m_discriminator < MinSize) + { + m_discriminator = gsl::narrow<uint32_t>(MinSize); + } + + if (m_discriminator > MaxSize) + { + m_discriminator = gsl::narrow<uint32_t>(MaxSize); + } + + m_threshold = gsl::narrow<uint32_t>((uint64_t(std::numeric_limits<uint32_t>::max()) + 1) / m_discriminator); + + m_chunkSizeMin = MinSize; + m_chunkSizeMax = MaxSize; + m_chunkSizeAvg = AvgSize; +} + +size_t +ZenChunker::ScanChunk(const void* DataBytesIn, size_t ByteCount) +{ + size_t Result = InternalScanChunk(DataBytesIn, ByteCount); + + if (Result == NoBoundaryFound) + { + m_bytesScanned += ByteCount; + } + else + { + m_bytesScanned += Result; + } + + return Result; +} + +size_t +ZenChunker::InternalScanChunk(const void* DataBytesIn, size_t ByteCount) +{ + size_t CurrentOffset = 0; + const uint8_t* CursorPtr = reinterpret_cast<const uint8_t*>(DataBytesIn); + + // There's no point in updating the hash if we know we're not + // going to have a cut point, so just skip the data. 
This logic currently + // provides roughly a 20% speedup on my machine + + const size_t NeedHashOffset = m_chunkSizeMin - kWindowSize; + + if (m_currentChunkSize < NeedHashOffset) + { + const uint32_t SkipBytes = gsl::narrow<uint32_t>(std::min<uint64_t>(ByteCount, NeedHashOffset - m_currentChunkSize)); + + ByteCount -= SkipBytes; + m_currentChunkSize += SkipBytes; + CurrentOffset += SkipBytes; + CursorPtr += SkipBytes; + + m_windowSize = 0; + + if (ByteCount == 0) + { + return NoBoundaryFound; + } + } + + // Fill window first + + if (m_windowSize < kWindowSize) + { + const uint32_t FillBytes = uint32_t(std::min<size_t>(ByteCount, kWindowSize - m_windowSize)); + + memcpy(&m_window[m_windowSize], CursorPtr, FillBytes); + + CursorPtr += FillBytes; + + m_windowSize += FillBytes; + m_currentChunkSize += FillBytes; + + CurrentOffset += FillBytes; + ByteCount -= FillBytes; + + if (m_windowSize < kWindowSize) + { + return NoBoundaryFound; + } + + // We have a full window, initialize hash + + uint32_t CurrentHash = 0; + + for (int i = 1; i < kWindowSize; ++i) + { + CurrentHash ^= detail::Rotate32(detail::buzhashTable[m_window[i - 1]], kWindowSize - i); + } + + m_currentHash = CurrentHash ^ detail::buzhashTable[m_window[kWindowSize - 1]]; + } + + // Scan for boundaries (i.e points where the hash matches the value determined by + // the discriminator) + + uint32_t CurrentHash = m_currentHash; + uint32_t CurrentChunkSize = m_currentChunkSize; + + size_t Index = CurrentChunkSize % kWindowSize; + + if (m_threshold && m_useThreshold) + { + // This is roughly 4x faster than the general modulo approach on my + // TR 3990X (~940MB/sec) and doesn't require any special parameters to + // achieve max performance + + while (ByteCount) + { + const uint8_t NewByte = *CursorPtr; + const uint8_t OldByte = m_window[Index]; + + CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::buzhashTable[OldByte], m_windowSize) ^ + detail::buzhashTable[NewByte]; + + 
CurrentChunkSize++; + CurrentOffset++; + + if (CurrentChunkSize >= m_chunkSizeMin) + { + bool foundBreak; + + if (CurrentChunkSize >= m_chunkSizeMax) + { + foundBreak = true; + } + else + { + foundBreak = CurrentHash <= m_threshold; + } + + if (foundBreak) + { + // Boundary found! + InternalReset(); + + return CurrentOffset; + } + } + + m_window[Index++] = *CursorPtr; + + if (Index == kWindowSize) + { + Index = 0; + } + + ++CursorPtr; + --ByteCount; + } + } + else if ((m_discriminator & (m_discriminator - 1)) == 0) + { + // This is quite a bit faster than the generic modulo path, but + // requires a very specific average chunk size to be used. If you + // pass in an even power-of-two divided by 0.75 as the average + // chunk size you'll hit this path + + const uint32_t Mask = m_discriminator - 1; + + while (ByteCount) + { + const uint8_t NewByte = *CursorPtr; + const uint8_t OldByte = m_window[Index]; + + CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::buzhashTable[OldByte], m_windowSize) ^ + detail::buzhashTable[NewByte]; + + CurrentChunkSize++; + CurrentOffset++; + + if (CurrentChunkSize >= m_chunkSizeMin) + { + bool foundBreak; + + if (CurrentChunkSize >= m_chunkSizeMax) + { + foundBreak = true; + } + else + { + foundBreak = (CurrentHash & Mask) == Mask; + } + + if (foundBreak) + { + // Boundary found! 
+ InternalReset(); + + return CurrentOffset; + } + } + + m_window[Index++] = *CursorPtr; + + if (Index == kWindowSize) + { + Index = 0; + } + + ++CursorPtr; + --ByteCount; + } + } + else + { + // This is the slowest path, which caps out around 250MB/sec for large sizes + // on my TR3900X + + while (ByteCount) + { + const uint8_t NewByte = *CursorPtr; + const uint8_t OldByte = m_window[Index]; + + CurrentHash = detail::Rotate32(CurrentHash, 1) ^ detail::Rotate32(detail::buzhashTable[OldByte], m_windowSize) ^ + detail::buzhashTable[NewByte]; + + CurrentChunkSize++; + CurrentOffset++; + + if (CurrentChunkSize >= m_chunkSizeMin) + { + bool foundBreak; + + if (CurrentChunkSize >= m_chunkSizeMax) + { + foundBreak = true; + } + else + { + foundBreak = (CurrentHash % m_discriminator) == (m_discriminator - 1); + } + + if (foundBreak) + { + // Boundary found! + InternalReset(); + + return CurrentOffset; + } + } + + m_window[Index++] = *CursorPtr; + + if (Index == kWindowSize) + { + Index = 0; + } + + ++CursorPtr; + --ByteCount; + } + } + + m_currentChunkSize = CurrentChunkSize; + m_currentHash = CurrentHash; + + return NoBoundaryFound; +} + +////////////////////////////////////////////////////////////////////////// + +class DirectoryScanner +{ +public: + struct FileEntry + { + std::filesystem::path Path; + uint64_t FileSize; + }; + + const std::vector<FileEntry>& Files() { return m_Files; } + std::vector<FileEntry>&& TakeFiles() { return std::move(m_Files); } + uint64_t FileBytes() const { return m_FileBytes; } + + void Scan(std::filesystem::path RootPath) + { + for (const std::filesystem::directory_entry& Entry : std::filesystem::recursive_directory_iterator(RootPath)) + { + if (Entry.is_regular_file()) + { + m_Files.push_back({Entry.path(), Entry.file_size()}); + m_FileBytes += Entry.file_size(); + } + } + } + +private: + std::vector<FileEntry> m_Files; + uint64_t m_FileBytes = 0; +}; + +////////////////////////////////////////////////////////////////////////// + +class 
BaseChunker +{ +public: + void SetCasStore(zen::CasStore* CasStore) { m_CasStore = CasStore; } + + struct StatsBlock + { + uint64_t TotalBytes = 0; + uint64_t TotalChunks = 0; + uint64_t TotalCompressed = 0; + uint64_t UniqueBytes = 0; + uint64_t UniqueChunks = 0; + uint64_t UniqueCompressed = 0; + uint64_t DuplicateBytes = 0; + uint64_t NewCasChunks = 0; + uint64_t NewCasBytes = 0; + + StatsBlock& operator+=(const StatsBlock& Rhs) + { + TotalBytes += Rhs.TotalBytes; + TotalChunks += Rhs.TotalChunks; + TotalCompressed += Rhs.TotalCompressed; + UniqueBytes += Rhs.UniqueBytes; + UniqueChunks += Rhs.UniqueChunks; + UniqueCompressed += Rhs.UniqueCompressed; + DuplicateBytes += Rhs.DuplicateBytes; + NewCasChunks += Rhs.NewCasChunks; + NewCasBytes += Rhs.NewCasBytes; + return *this; + } + }; + +protected: + Concurrency::combinable<StatsBlock> m_StatsBlock; + +public: + StatsBlock SumStats() + { + StatsBlock _; + m_StatsBlock.combine_each([&](const StatsBlock& Block) { _ += Block; }); + return _; + } + +protected: + struct HashSet + { + bool Add(const zen::IoHash& Hash) + { + const uint8_t ShardNo = Hash.Hash[19]; + + Bucket& Shard = m_Buckets[ShardNo]; + + zen::RwLock::ExclusiveLockScope _(Shard.HashLock); + + auto rv = Shard.Hashes.insert(Hash); + + return rv.second; + } + + private: + struct alignas(64) Bucket + { + zen::RwLock HashLock; + std::unordered_set<zen::IoHash, zen::IoHash::Hasher> Hashes; +#pragma warning(suppress : 4324) // Padding due to alignment + }; + + Bucket m_Buckets[256]; + }; + + zen::CasStore* m_CasStore = nullptr; +}; + +class FixedBlockSizeChunker : public BaseChunker +{ +public: + FixedBlockSizeChunker(std::filesystem::path InRootPath) : m_RootPath(InRootPath) {} + ~FixedBlockSizeChunker() = default; + + void SetChunkSize(uint64_t ChunkSize) + { + /* TODO: verify validity of chunk size */ + m_ChunkSize = ChunkSize; + } + void SetUseCompression(bool UseCompression) { m_UseCompression = UseCompression; } + void SetPerformValidation(bool 
PerformValidation) { m_PerformValidation = PerformValidation; } + + void InitCompression() + { + if (!m_CompressionBufferManager) + { + std::call_once(m_CompressionInitFlag, [&] { + // Wasteful, but should only be temporary + m_CompressionBufferManager.reset(new FileBufferManager(m_ChunkSize * 2, 128)); + }); + } + } + + void ChunkFile(const DirectoryScanner::FileEntry& File) + { + InitCompression(); + + std::filesystem::path RelativePath{std::filesystem::relative(File.Path.generic_string(), m_RootPath)}; + + Concurrency::task_group ChunkProcessTasks; + + spdlog::info("Chunking {} ({})", RelativePath.generic_string(), zen::NiceBytes(File.FileSize)); + + zen::RefPtr<InternalFile> Zfile = new InternalFile; + Zfile->OpenRead(File.Path); + + size_t FileBytes = Zfile->GetFileSize(); + uint64_t CurrentFileOffset = 0; + + std::vector<zen::IoHash> BlockHashes{(FileBytes + m_ChunkSize - 1) / m_ChunkSize}; + + while (FileBytes) + { + zen::IoBuffer Buffer = m_BufferManager.AllocBuffer(); + + const size_t BytesToRead = std::min(FileBytes, Buffer.Size()); + + Zfile->Read((void*)Buffer.Data(), BytesToRead, CurrentFileOffset); + + auto ProcessChunk = [this, Buffer, &BlockHashes, CurrentFileOffset, BytesToRead] { + StatsBlock& Stats = m_StatsBlock.local(); + for (uint64_t Offset = 0; Offset < BytesToRead; Offset += m_ChunkSize) + { + const uint8_t* DataPointer = reinterpret_cast<const uint8_t*>(Buffer.Data()) + Offset; + const uint64_t DataSize = std::min(BytesToRead - Offset, m_ChunkSize); + const zen::IoHash Hash = zen::IoHash::HashMemory(DataPointer, DataSize); + + BlockHashes[(CurrentFileOffset + Offset) / m_ChunkSize] = Hash; + + const bool IsNew = m_LocalHashSet.Add(Hash); + + if (IsNew) + { + if (m_UseCompression) + { + if (true) + { + // Compress using ZSTD + const size_t CompressBufferSize = ZSTD_compressBound(DataSize); + + zen::IoBuffer CompressedBuffer = m_CompressionBufferManager->AllocBuffer(); + char* CompressBuffer = (char*)CompressedBuffer.Data(); + + 
ZEN_ASSERT(CompressedBuffer.Size() >= CompressBufferSize); + + const size_t CompressedSize = ZSTD_compress(CompressBuffer, + CompressBufferSize, + (const char*)DataPointer, + DataSize, + ZSTD_CLEVEL_DEFAULT); + + Stats.UniqueCompressed += CompressedSize; + + if (m_CasStore) + { + const zen::IoHash CompressedHash = zen::IoHash::HashMemory(CompressBuffer, CompressedSize); + zen::CasStore::InsertResult Result = + m_CasStore->InsertChunk(CompressBuffer, CompressedSize, CompressedHash); + + if (Result.New) + { + Stats.NewCasChunks += 1; + Stats.NewCasBytes += CompressedSize; + } + } + + m_CompressionBufferManager->ReturnBuffer(CompressedBuffer); + } + else + { + // Compress using LZ4 + const int CompressBufferSize = LZ4_compressBound(gsl::narrow<int>(DataSize)); + + zen::IoBuffer CompressedBuffer = m_CompressionBufferManager->AllocBuffer(); + char* CompressBuffer = (char*)CompressedBuffer.Data(); + + ZEN_ASSERT(CompressedBuffer.Size() >= CompressBufferSize); + + const int CompressedSize = LZ4_compress_default((const char*)DataPointer, + CompressBuffer, + gsl::narrow<int>(DataSize), + CompressBufferSize); + + Stats.UniqueCompressed += CompressedSize; + + if (m_CasStore) + { + const zen::IoHash CompressedHash = zen::IoHash::HashMemory(CompressBuffer, CompressedSize); + zen::CasStore::InsertResult Result = + m_CasStore->InsertChunk(CompressBuffer, CompressedSize, CompressedHash); + + if (Result.New) + { + Stats.NewCasChunks += 1; + Stats.NewCasBytes += CompressedSize; + } + } + + m_CompressionBufferManager->ReturnBuffer(CompressedBuffer); + } + } + else if (m_CasStore) + { + zen::CasStore::InsertResult Result = m_CasStore->InsertChunk(zen::IoBuffer(Buffer, Offset, DataSize), Hash); + + if (Result.New) + { + Stats.NewCasChunks += 1; + Stats.NewCasBytes += DataSize; + } + } + + Stats.UniqueBytes += DataSize; + Stats.UniqueChunks += 1; + } + else + { + // We've seen this chunk before + Stats.DuplicateBytes += DataSize; + } + + Stats.TotalBytes += DataSize; + Stats.TotalChunks 
+= 1; + } + + m_BufferManager.ReturnBuffer(Buffer); + }; + + ChunkProcessTasks.run(ProcessChunk); + + CurrentFileOffset += BytesToRead; + FileBytes -= BytesToRead; + } + + ChunkProcessTasks.wait(); + + // Verify pass + + if (!m_UseCompression && m_PerformValidation) + { + const uint8_t* FileData = reinterpret_cast<const uint8_t*>(Zfile->MemoryMapFile()); + uint64_t Offset = 0; + const uint64_t BytesToRead = Zfile->GetFileSize(); + + for (zen::IoHash& Hash : BlockHashes) + { + const uint64_t DataSize = std::min(BytesToRead - Offset, m_ChunkSize); + const zen::IoHash CalcHash = zen::IoHash::HashMemory(FileData + Offset, DataSize); + + ZEN_ASSERT(CalcHash == Hash); + + zen::IoBuffer FoundValue = m_CasStore->FindChunk(CalcHash); + + ZEN_ASSERT(FoundValue); + ZEN_ASSERT(FoundValue.Size() == DataSize); + + Offset += DataSize; + } + } + } + +private: + std::filesystem::path m_RootPath; + FileBufferManager m_BufferManager{128 * 1024, 128}; + uint64_t m_ChunkSize = 64 * 1024; + HashSet m_LocalHashSet; + bool m_UseCompression = true; + bool m_PerformValidation = false; + + std::once_flag m_CompressionInitFlag; + std::unique_ptr<FileBufferManager> m_CompressionBufferManager; +}; + +class VariableBlockSizeChunker : public BaseChunker +{ +public: + VariableBlockSizeChunker(std::filesystem::path InRootPath) : m_RootPath(InRootPath) {} + + void SetAverageChunkSize(uint64_t AverageChunkSize) { m_AverageChunkSize = AverageChunkSize; } + void SetUseCompression(bool UseCompression) { m_UseCompression = UseCompression; } + + void ChunkFile(const DirectoryScanner::FileEntry& File) + { + std::filesystem::path RelativePath{std::filesystem::relative(File.Path.generic_string(), m_RootPath)}; + + spdlog::info("Chunking {} ({})", RelativePath.generic_string(), zen::NiceBytes(File.FileSize)); + + zen::RefPtr<InternalFile> Zfile = new InternalFile; + Zfile->OpenRead(File.Path); + + // Could use IoBuffer here to help manage lifetimes of things + // across tasks / threads + + ZenChunker Chunker; 
+ Chunker.SetChunkSize(0, 0, m_AverageChunkSize); + + const size_t DataSize = Zfile->GetFileSize(); + + std::vector<size_t> Boundaries; + + uint64_t CurrentStreamPosition = 0; + uint64_t CurrentChunkSize = 0; + size_t RemainBytes = DataSize; + + Concurrency::structured_task_group CompressionTasks; + + zen::IoHashStream IoHashStream; + + while (RemainBytes != 0) + { + zen::IoBuffer Buffer = m_BufferManager.AllocBuffer(); + + size_t BytesToRead = std::min(RemainBytes, Buffer.Size()); + + uint8_t* DataPointer = (uint8_t*)Buffer.Data(); + + Zfile->Read(DataPointer, BytesToRead, CurrentStreamPosition); + + StatsBlock& Stats = m_StatsBlock.local(); + + while (BytesToRead) + { + const size_t Boundary = Chunker.ScanChunk(DataPointer, BytesToRead); + + if (Boundary == ZenChunker::NoBoundaryFound) + { + IoHashStream.Append(DataPointer, BytesToRead); + CurrentStreamPosition += BytesToRead; + CurrentChunkSize += BytesToRead; + RemainBytes -= BytesToRead; + break; + } + + // Boundary found + + IoHashStream.Append(DataPointer, Boundary); + + const zen::IoHash Hash = IoHashStream.GetHash(); + const bool IsNew = m_LocalHashSet.Add(Hash); + + CurrentStreamPosition += Boundary; + CurrentChunkSize += Boundary; + Boundaries.push_back(CurrentStreamPosition); + + if (IsNew) + { + Stats.UniqueBytes += CurrentChunkSize; + } + else + { + // We've seen this chunk before + Stats.DuplicateBytes += CurrentChunkSize; + } + + DataPointer += Boundary; + RemainBytes -= Boundary; + BytesToRead -= Boundary; + CurrentChunkSize = 0; + IoHashStream.Reset(); + } + + m_BufferManager.ReturnBuffer(Buffer); + +#if 0 + Active.AddCount(); // needs fixing + + Concurrency::create_task([this, Zfile, CurrentPosition, DataPointer, &Active] { + const zen::IoHash Hash = zen::IoHash::HashMemory(DataPointer, CurrentPosition); + + const bool isNew = m_LocalHashSet.Add(Hash); + + const int CompressBufferSize = LZ4_compressBound(gsl::narrow<int>(CurrentPosition)); + char* CompressBuffer = 
(char*)_aligned_malloc(CompressBufferSize, 16); + + const int CompressedSize = + LZ4_compress_default((const char*)DataPointer, CompressBuffer, gsl::narrow<int>(CurrentPosition), CompressBufferSize); + + m_TotalCompressed.local() += CompressedSize; + + if (isNew) + { + m_UniqueBytes.local() += CurrentPosition; + m_UniqueCompressed.local() += CompressedSize; + + if (m_CasStore) + { + const zen::IoHash CompressedHash = zen::IoHash::HashMemory(CompressBuffer, CompressedSize); + m_CasStore->InsertChunk(CompressBuffer, CompressedSize, CompressedHash); + } + } + + Active.Signal(); // needs fixing + + _aligned_free(CompressBuffer); + }); +#endif + } + + StatsBlock& Stats = m_StatsBlock.local(); + Stats.TotalBytes += DataSize; + Stats.TotalChunks += Boundaries.size() + 1; + + // TODO: Wait for all compression tasks + + auto ChunkCount = Boundaries.size() + 1; + + spdlog::info("Split {} ({}) into {} chunks, avg size {}", + RelativePath.generic_string(), + zen::NiceBytes(File.FileSize), + ChunkCount, + File.FileSize / ChunkCount); + }; + +private: + HashSet m_LocalHashSet; + std::filesystem::path m_RootPath; + uint64_t m_AverageChunkSize = 32 * 1024; + bool m_UseCompression = true; + FileBufferManager m_BufferManager{128 * 1024, 128}; +}; + +////////////////////////////////////////////////////////////////////////// + +ChunkCommand::ChunkCommand() +{ + m_Options.add_options()("r,root", "Root directory for CAS pool", cxxopts::value(m_RootDirectory)); + m_Options.add_options()("d,dir", "Directory to scan", cxxopts::value(m_ScanDirectory)); + m_Options.add_options()("c,chunk-size", "Use fixed chunk size", cxxopts::value(m_ChunkSize)); + m_Options.add_options()("a,average-chunk-size", "Use dynamic chunk size", cxxopts::value(m_AverageChunkSize)); + m_Options.add_options()("compress", "Apply compression to chunks", cxxopts::value(m_UseCompression)); +} + +ChunkCommand::~ChunkCommand() = default; + +int +ChunkCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) 
+{ + ZEN_UNUSED(GlobalOptions); + + auto result = m_Options.parse(argc, argv); + + bool IsValid = m_ScanDirectory.length(); + + if (!IsValid) + throw cxxopts::OptionParseException("Chunk command requires a directory to scan"); + + if ((m_ChunkSize && m_AverageChunkSize) && (!m_ChunkSize && !m_AverageChunkSize)) + throw cxxopts::OptionParseException("Either of --chunk-size or --average-chunk-size must be used"); + + std::unique_ptr<zen::CasStore> CasStore; + + if (!m_RootDirectory.empty()) + { + zen::CasStoreConfiguration Config; + Config.RootDirectory = m_RootDirectory; + + CasStore.reset(zen::CreateCasStore()); + CasStore->Initialize(Config); + } + + // Gather list of files to process + + spdlog::info("Gathering files from {}", m_ScanDirectory); + + std::filesystem::path RootPath{m_ScanDirectory}; + DirectoryScanner Scanner; + Scanner.Scan(RootPath); + + auto Files = Scanner.TakeFiles(); + uint64_t FileBytes = Scanner.FileBytes(); + + std::sort(begin(Files), end(Files), [](const DirectoryScanner::FileEntry& Lhs, const DirectoryScanner::FileEntry& Rhs) { + return Lhs.FileSize < Rhs.FileSize; + }); + + spdlog::info("Gathered {} files, total size {}", Files.size(), zen::NiceBytes(FileBytes)); + + auto ReportSummary = [&](BaseChunker& Chunker, uint64_t ElapsedMs) { + const BaseChunker::StatsBlock& Stats = Chunker.SumStats(); + + const size_t TotalChunkCount = Stats.TotalChunks; + spdlog::info("Scanned {} files in {}, generated {} chunks", Files.size(), zen::NiceTimeSpanMs(ElapsedMs), TotalChunkCount); + + const size_t TotalByteCount = Stats.TotalBytes; + const size_t TotalCompressedBytes = Stats.TotalCompressed; + + spdlog::info("Total bytes {} ({}), compresses into {}", + zen::NiceBytes(TotalByteCount), + zen::NiceByteRate(TotalByteCount, ElapsedMs), + zen::NiceBytes(TotalCompressedBytes)); + + const size_t TotalUniqueBytes = Stats.UniqueBytes; + const size_t TotalUniqueCompressedBytes = Stats.UniqueCompressed; + const size_t TotalDuplicateBytes = 
Stats.DuplicateBytes; + + spdlog::info("Chunksize average {}, unique bytes = {} (compressed {}), dup bytes = {}", + TotalByteCount / TotalChunkCount, + zen::NiceBytes(TotalUniqueBytes), + zen::NiceBytes(TotalUniqueCompressedBytes), + zen::NiceBytes(TotalDuplicateBytes)); + + spdlog::info("New to CAS: {} chunks, {}", Stats.NewCasChunks, zen::NiceBytes(Stats.NewCasBytes)); + }; + + // Process them as quickly as possible + + if (m_AverageChunkSize) + { + VariableBlockSizeChunker Chunker{RootPath}; + Chunker.SetAverageChunkSize(m_AverageChunkSize); + Chunker.SetUseCompression(m_UseCompression); + Chunker.SetCasStore(CasStore.get()); + + zen::Stopwatch timer; + +#if 1 + Concurrency::parallel_for_each(begin(Files), end(Files), [&Chunker](const auto& ThisFile) { Chunker.ChunkFile(ThisFile); }); +#else + for (const auto& ThisFile : Files) + { + Chunker.ChunkFile(ThisFile); + } +#endif + + uint64_t ElapsedMs = timer.getElapsedTimeMs(); + + ReportSummary(Chunker, ElapsedMs); + } + else if (m_ChunkSize) + { + FixedBlockSizeChunker Chunker{RootPath}; + Chunker.SetChunkSize(m_ChunkSize); + Chunker.SetUseCompression(m_UseCompression); + Chunker.SetCasStore(CasStore.get()); + + zen::Stopwatch timer; + + Concurrency::parallel_for_each(begin(Files), end(Files), [&Chunker](const DirectoryScanner::FileEntry& ThisFile) { + try + { + Chunker.ChunkFile(ThisFile); + } + catch (std::exception& ex) + { + zen::ExtendableStringBuilder<256> Path8; + zen::WideToUtf8(ThisFile.Path.c_str(), Path8); + spdlog::warn("Caught exception while chunking '{}': {}", Path8, ex.what()); + } + }); + + uint64_t ElapsedMs = timer.getElapsedTimeMs(); + + ReportSummary(Chunker, ElapsedMs); + } + else + { + ZEN_ASSERT(false); + } + + // TODO: implement snapshot enumeration and display + return 0; +} + +////////////////////////////////////////////////////////////////////////// + +TEST_CASE("chunking") +{ + using namespace zen; + + auto test = [](bool UseThreshold, bool Random, int MinBlockSize, int MaxBlockSize) { 
+ std::mt19937_64 mt; + + std::vector<uint64_t> bytes; + bytes.resize(1 * 1024 * 1024); + + if (Random == false) + { + // Generate a single block of randomness + for (auto& w : bytes) + { + w = mt(); + } + } + + for (int i = MinBlockSize; i <= MaxBlockSize; i <<= 1) + { + Stopwatch timer; + + ZenChunker chunker; + chunker.SetUseThreshold(UseThreshold); + chunker.SetChunkSize(0, 0, i); + // chunker.SetChunkSize(i / 4, i * 4, 0); + // chunker.SetChunkSize(i / 8, i * 8, 0); + // chunker.SetChunkSize(i / 16, i * 16, 0); + // chunker.SetChunkSize(0, 0, size_t(i / 0.75)); // Hits the fast modulo path + + std::vector<size_t> boundaries; + + size_t CurrentPosition = 0; + int BoundaryCount = 0; + + do + { + if (Random == true) + { + // Generate a new block of randomness for each pass + for (auto& w : bytes) + { + w = mt(); + } + } + + const uint8_t* Ptr = reinterpret_cast<const uint8_t*>(bytes.data()); + size_t BytesRemain = bytes.size() * sizeof(uint64_t); + + for (;;) + { + const size_t Boundary = chunker.ScanChunk(Ptr, BytesRemain); + + if (Boundary == ZenChunker::NoBoundaryFound) + { + CurrentPosition += BytesRemain; + break; + } + + // Boundary found + + CurrentPosition += Boundary; + + CHECK(CurrentPosition >= chunker.ChunkSizeMin()); + CHECK(CurrentPosition <= chunker.ChunkSizeMax()); + + boundaries.push_back(CurrentPosition); + + CurrentPosition = 0; + Ptr += Boundary; + BytesRemain -= Boundary; + + ++BoundaryCount; + } + } while (BoundaryCount < 5000); + + size_t BoundarySum = 0; + + for (const auto& v : boundaries) + { + BoundarySum += v; + } + + double Avg = double(BoundarySum) / BoundaryCount; + const uint64_t ElapsedTimeMs = timer.getElapsedTimeMs(); + + spdlog::info("{:9} : Avg {:9} - {:2.5} ({:6}, {})", + i, + Avg, + double(i / Avg), + NiceTimeSpanMs(ElapsedTimeMs), + NiceByteRate(chunker.BytesScanned(), ElapsedTimeMs)); + } + }; + + const bool Random = false; + + SUBCASE("threshold method") { test(/* UseThreshold */ true, /* Random */ Random, 2048, 1 * 1024 
* 1024); } + + SUBCASE("mod method") { test(/* UseThreshold */ false, /* Random */ Random, 2048, 1 * 1024 * 1024); } +} diff --git a/zen/chunk/chunk.h b/zen/chunk/chunk.h new file mode 100644 index 000000000..f93f7e4f2 --- /dev/null +++ b/zen/chunk/chunk.h @@ -0,0 +1,23 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once +#include <zencore/zencore.h> +#include "../zen.h" + +class ChunkCommand : public ZenCmdBase +{ +public: + ChunkCommand(); + ~ChunkCommand(); + + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) override; + virtual cxxopts::Options* Options() override { return &m_Options; } + +private: + cxxopts::Options m_Options{"chunk", "Do a chunking pass"}; + std::string m_RootDirectory; + std::string m_ScanDirectory; + size_t m_ChunkSize = 0; + size_t m_AverageChunkSize = 0; + bool m_UseCompression = true; +}; diff --git a/zen/cmds/copy.cpp b/zen/cmds/copy.cpp new file mode 100644 index 000000000..21f3f4a19 --- /dev/null +++ b/zen/cmds/copy.cpp @@ -0,0 +1,99 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "copy.h" + +#include <zencore/filesystem.h> +#include <zencore/string.h> +#include <zencore/timer.h> + +#include <spdlog/spdlog.h> + +CopyCommand::CopyCommand() +{ + m_Options.add_options()("h,help", "Print help"); + m_Options.add_options()("no-clone", "Do not perform block clone", cxxopts::value(m_NoClone)->default_value("false")); + m_Options.add_option("", "s", "source", "Copy source", cxxopts::value(m_CopySource), "<file/directory>"); + m_Options.add_option("", "t", "target", "Copy target", cxxopts::value(m_CopyTarget), "<file/directory>"); + m_Options.add_option("", "", "positional", "Positional arguments", cxxopts::value(m_Positional), ""); +} + +CopyCommand::~CopyCommand() = default; + +int +CopyCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) +{ + ZEN_UNUSED(GlobalOptions); + + m_Options.parse_positional({"source", "target", "positional"}); + + auto result = m_Options.parse(argc, argv); + + if (result.count("help")) + { + std::cout << m_Options.help({"", "Group"}) << std::endl; + + return 0; + } + + // Validate arguments + + if (m_CopySource.empty()) + throw std::exception("No source specified"); + + if (m_CopyTarget.empty()) + throw std::exception("No target specified"); + + std::filesystem::path FromPath; + std::filesystem::path ToPath; + + FromPath = m_CopySource; + ToPath = m_CopyTarget; + + const bool IsFileCopy = std::filesystem::is_regular_file(m_CopySource); + const bool IsDirCopy = std::filesystem::is_directory(m_CopySource); + + if (!IsFileCopy && !IsDirCopy) + { + throw std::exception("Invalid source specification (neither directory nor file)"); + } + + if (IsFileCopy && IsDirCopy) + { + throw std::exception("Invalid source specification (both directory AND file!?)"); + } + + if (IsDirCopy) + { + if (std::filesystem::exists(ToPath)) + { + const bool IsTargetDir = std::filesystem::is_directory(ToPath); + if (!IsTargetDir) + { + if (std::filesystem::is_regular_file(ToPath)) + { + throw 
std::exception("Attempted copy of directory into file"); + } + } + } + else + { + std::filesystem::create_directories(ToPath); + } + } + else + { + // Single file copy + + zen::Stopwatch Timer; + + zen::CopyFileOptions CopyOptions; + CopyOptions.EnableClone = !m_NoClone; + zen::CopyFile(FromPath, ToPath, CopyOptions); + + spdlog::info("Copy completed in {}", zen::NiceTimeSpanMs(Timer.getElapsedTimeMs())); + } + + return 0; +} diff --git a/zen/cmds/copy.h b/zen/cmds/copy.h new file mode 100644 index 000000000..22b240d11 --- /dev/null +++ b/zen/cmds/copy.h @@ -0,0 +1,24 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "../zen.h" + +/** Copy files, possibly using block cloning + */ +class CopyCommand : public ZenCmdBase +{ +public: + CopyCommand(); + ~CopyCommand(); + + virtual cxxopts::Options* Options() override { return &m_Options; } + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) override; + +private: + cxxopts::Options m_Options{"copy", "Copy files"}; + std::vector<std::string> m_Positional; + std::string m_CopySource; + std::string m_CopyTarget; + bool m_NoClone = false; +}; diff --git a/zen/cmds/dedup.cpp b/zen/cmds/dedup.cpp new file mode 100644 index 000000000..1169f22f7 --- /dev/null +++ b/zen/cmds/dedup.cpp @@ -0,0 +1,294 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "dedup.h" + +#include <zencore/blake3.h> +#include <zencore/filesystem.h> +#include <zencore/iobuffer.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/timer.h> + +#include <ppl.h> +#include <spdlog/spdlog.h> + +DedupCommand::DedupCommand() +{ + m_Options.add_options()("h,help", "Print help"); + m_Options.add_options()("size", "Configure size threshold for dedup", cxxopts::value(m_SizeThreshold)->default_value("131072")); + m_Options.add_option("", "s", "source", "Copy source", cxxopts::value(m_DedupSource), "<file/directory>"); + m_Options.add_option("", "t", "target", "Copy target", cxxopts::value(m_DedupTarget), "<file/directory>"); + m_Options.add_option("", "", "positional", "Positional arguments", cxxopts::value(m_Positional), ""); +} + +DedupCommand::~DedupCommand() = default; + +int +DedupCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) +{ + ZEN_UNUSED(GlobalOptions); + + m_Options.parse_positional({"source", "target", "positional"}); + + auto result = m_Options.parse(argc, argv); + + if (result.count("help")) + { + std::cout << m_Options.help({"", "Group"}) << std::endl; + + return 0; + } + + // Validate arguments + + const bool SourceGood = zen::SupportsBlockRefCounting(m_DedupSource); + const bool TargetGood = zen::SupportsBlockRefCounting(m_DedupTarget); + + if (!SourceGood) + { + spdlog::info("Source directory '{}' does not support deduplication", m_DedupSource); + + return 0; + } + + if (!TargetGood) + { + spdlog::info("Target directory '{}' does not support deduplication", m_DedupTarget); + + return 0; + } + + spdlog::info("Performing dedup operation between {} and {}, size threshold {}", + m_DedupSource, + m_DedupTarget, + zen::NiceBytes(m_SizeThreshold)); + + using DirEntryList_t = std::list<std::filesystem::directory_entry>; + + zen::RwLock MapLock; + std::unordered_map<size_t, DirEntryList_t> FileSizeMap; + size_t CandidateCount = 0; + + auto AddToList = [&](const 
std::filesystem::directory_entry& Entry) { + if (Entry.is_regular_file()) + { + uintmax_t FileSize = Entry.file_size(); + if (FileSize > m_SizeThreshold) + { + zen::RwLock::ExclusiveLockScope _(MapLock); + FileSizeMap[FileSize].push_back(Entry); + ++CandidateCount; + } + } + }; + + std::filesystem::recursive_directory_iterator DirEnd; + + struct Utf8Helper + { + zen::ExtendableStringBuilder<128> Path8; + + Utf8Helper(const wchar_t* Path) { zen::WideToUtf8(Path, Path8); }; + + std::string_view c_str() { return std::string_view(Path8); }; + }; + + spdlog::info("Gathering file info from source: '{}'", m_DedupSource); + spdlog::info("Gathering file info from target: '{}'", m_DedupTarget); + + { + zen::Stopwatch Timer; + + Concurrency::parallel_invoke( + [&] { + for (std::filesystem::recursive_directory_iterator DirIt1(m_DedupSource); DirIt1 != DirEnd; ++DirIt1) + { + AddToList(*DirIt1); + } + }, + [&] { + for (std::filesystem::recursive_directory_iterator DirIt2(m_DedupTarget); DirIt2 != DirEnd; ++DirIt2) + { + AddToList(*DirIt2); + } + }); + + spdlog::info("Gathered {} candidates across {} size buckets. 
Elapsed: {}", + CandidateCount, + FileSizeMap.size(), + zen::NiceTimeSpanMs(Timer.getElapsedTimeMs())); + } + + spdlog::info("Sorting buckets by size"); + + zen::Stopwatch Timer; + + uint64_t DupeBytes = 0; + + struct SizeList + { + size_t Size; + DirEntryList_t* DirEntries; + }; + + std::vector<SizeList> SizeLists{FileSizeMap.size()}; + + { + int i = 0; + + for (auto& kv : FileSizeMap) + { + ZEN_ASSERT(kv.first >= m_SizeThreshold); + SizeLists[i].Size = kv.first; + SizeLists[i].DirEntries = &kv.second; + ++i; + } + } + + std::sort(begin(SizeLists), end(SizeLists), [](const SizeList& Lhs, const SizeList& Rhs) { return Lhs.Size > Rhs.Size; }); + + spdlog::info("Bucket summary:"); + + std::vector<size_t> BucketId; + std::vector<size_t> BucketOffsets; + std::vector<size_t> BucketSizes; + std::vector<size_t> BucketFileCounts; + + size_t TotalFileSizes = 0; + size_t TotalFileCount = 0; + + { + size_t CurrentPow2 = 0; + size_t BucketSize = 0; + size_t BucketFileCount = 0; + bool FirstBucket = true; + + for (int i = 0; i < SizeLists.size(); ++i) + { + const size_t ThisSize = SizeLists[i].Size; + const size_t Pow2 = zen::NextPow2(ThisSize); + + if (CurrentPow2 != Pow2) + { + CurrentPow2 = Pow2; + + if (!FirstBucket) + { + BucketSizes.push_back(BucketSize); + BucketFileCounts.push_back(BucketFileCount); + } + + BucketId.push_back(Pow2); + BucketOffsets.push_back(i); + + FirstBucket = false; + BucketSize = 0; + BucketFileCount = 0; + } + + BucketSize += ThisSize; + TotalFileSizes += ThisSize; + BucketFileCount += SizeLists[i].DirEntries->size(); + TotalFileCount += SizeLists[i].DirEntries->size(); + } + + if (!FirstBucket) + { + BucketSizes.push_back(BucketSize); + BucketFileCounts.push_back(BucketFileCount); + } + + ZEN_ASSERT(BucketOffsets.size() == BucketSizes.size()); + ZEN_ASSERT(BucketOffsets.size() == BucketFileCounts.size()); + } + + for (int i = 0; i < BucketOffsets.size(); ++i) + { + spdlog::info(" Bucket {} : {}, {} candidates", zen::NiceBytes(BucketId[i]), 
zen::NiceBytes(BucketSizes[i]), BucketFileCounts[i]); + } + + spdlog::info("Total : {}, {} candidates", zen::NiceBytes(TotalFileSizes), TotalFileCount); + + std::string CurrentNice; + + for (SizeList& Size : SizeLists) + { + std::string CurNice{zen::NiceBytes(zen::NextPow2(Size.Size))}; + + if (CurNice != CurrentNice) + { + CurrentNice = CurNice; + spdlog::info("Now scanning bucket: {}", CurrentNice); + } + + std::unordered_map<zen::BLAKE3, const std::filesystem::directory_entry*, zen::BLAKE3::Hasher> DedupMap; + + for (const auto& Entry : *Size.DirEntries) + { + zen::BLAKE3 Hash; + + if constexpr (true) + { + zen::BLAKE3Stream b3s; + + zen::ScanFile(Entry.path(), 64 * 1024, [&](const void* Data, size_t Size) { b3s.Append(Data, Size); }); + + Hash = b3s.GetHash(); + } + else + { + zen::FileContents Contents = zen::ReadFile(Entry.path()); + + zen::BLAKE3Stream b3s; + + for (zen::IoBuffer& Buffer : Contents.Data) + { + b3s.Append(Buffer.Data(), Buffer.Size()); + } + Hash = b3s.GetHash(); + } + + if (const std::filesystem::directory_entry* Dupe = DedupMap[Hash]) + { + std::wstring FileA = Dupe->path().c_str(); + std::wstring FileB = Entry.path().c_str(); + + size_t MinLen = std::min(FileA.size(), FileB.size()); + auto Its = std::mismatch(FileB.rbegin(), FileB.rbegin() + MinLen, FileA.rbegin()); + + if (Its.first != FileB.rbegin()) + { + if (Its.first[-1] == '\\' || Its.first[-1] == '/') + --Its.first; + + FileB = std::wstring(FileB.begin(), Its.first.base()) + L"..."; + } + + spdlog::info("{} {} <-> {}", + zen::NiceBytes(Entry.file_size()).c_str(), + Utf8Helper(FileA.c_str()).c_str(), + Utf8Helper(FileB.c_str()).c_str()); + + zen::CopyFileOptions Options; + Options.EnableClone = true; + Options.MustClone = true; + + zen::CopyFile(Dupe->path(), Entry.path(), Options); + + DupeBytes += Entry.file_size(); + } + else + { + DedupMap[Hash] = &Entry; + } + } + + Size.DirEntries->clear(); + } + + spdlog::info("Elapsed: {} Deduped: {}", 
zen::NiceTimeSpanMs(Timer.getElapsedTimeMs()), zen::NiceBytes(DupeBytes)); + + return 0; +} diff --git a/zen/cmds/dedup.h b/zen/cmds/dedup.h new file mode 100644 index 000000000..0f0aecc8e --- /dev/null +++ b/zen/cmds/dedup.h @@ -0,0 +1,26 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "../zen.h" + +#include <ppl.h> + +/** Deduplicate files in a tree using block cloning + */ +class DedupCommand : public ZenCmdBase +{ +public: + DedupCommand(); + ~DedupCommand(); + + virtual cxxopts::Options* Options() override { return &m_Options; } + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) override; + +private: + cxxopts::Options m_Options{"dedup", "Deduplicate files"}; + std::vector<std::string> m_Positional; + std::string m_DedupSource; + std::string m_DedupTarget; + size_t m_SizeThreshold = 1024 * 1024; +}; diff --git a/zen/cmds/deploy.cpp b/zen/cmds/deploy.cpp new file mode 100644 index 000000000..9e39d3084 --- /dev/null +++ b/zen/cmds/deploy.cpp @@ -0,0 +1,85 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "deploy.h" + +#include <zencore/string.h> + +#include <spdlog/spdlog.h> + +DeployCommand::DeployCommand() +{ + m_Options.add_options()("h,help", "Print help"); + m_Options.add_options()("no-clone", "Do not perform block clone", cxxopts::value(m_NoClone)->default_value("false")); + m_Options.add_options()("clean", + "Make clean deploy (i.e remove anything in target first)", + cxxopts::value(m_IsClean)->default_value("false")); + m_Options.add_option("", "s", "source", "Deploy source", cxxopts::value(m_CopySource), "<build store>"); + m_Options.add_option("", "t", "target", "Deploy target", cxxopts::value(m_CopyTarget), "<directory>"); + m_Options.add_option("", "", "positional", "Positional arguments", cxxopts::value(m_Positional), ""); +} + +DeployCommand::~DeployCommand() = default; + +int +DeployCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) +{ + ZEN_UNUSED(GlobalOptions); + + m_Options.parse_positional({"source", "target", "positional"}); + + auto result = m_Options.parse(argc, argv); + + if (result.count("help")) + { + std::cout << m_Options.help({"", "Group"}) << std::endl; + + return 0; + } + + // Validate arguments + + if (m_CopySource.empty()) + throw std::exception("No source specified"); + + if (m_CopyTarget.empty()) + throw std::exception("No target specified"); + + std::filesystem::path ToPath; + + ToPath = m_CopyTarget; + + const bool IsTargetDir = std::filesystem::is_directory(ToPath); + bool IsTargetNew = !std::filesystem::exists(ToPath); + + if (!IsTargetNew && !IsTargetDir) + { + throw std::exception("Invalid target specification (needs to be a directory)"); + } + + zen::ExtendableStringBuilder<128> Path8; + zen::WideToUtf8(ToPath.c_str(), Path8); + + if (IsTargetNew == false && m_IsClean) + { + spdlog::info("Clean deploy -- deleting directory {}", Path8.c_str()); + + std::filesystem::remove_all(ToPath); + + IsTargetNew = true; // Create fresh new directory + } + + if (IsTargetNew) + { + 
spdlog::info("Creating directory {}", Path8.c_str()); + + std::filesystem::create_directories(ToPath); + } + + spdlog::info("Starting deploy operation..."); + + // TODO: implement! + + return 0; +} diff --git a/zen/cmds/deploy.h b/zen/cmds/deploy.h new file mode 100644 index 000000000..1109aaf17 --- /dev/null +++ b/zen/cmds/deploy.h @@ -0,0 +1,25 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "../zen.h" + +/** Deploy files from Zen build store + */ +class DeployCommand : public ZenCmdBase +{ +public: + DeployCommand(); + ~DeployCommand(); + + virtual cxxopts::Options* Options() override { return &m_Options; } + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) override; + +private: + cxxopts::Options m_Options{"deploy", "Deploy cooked data"}; + std::vector<std::string> m_Positional; + std::string m_CopySource; + std::string m_CopyTarget; + bool m_NoClone = false; + bool m_IsClean = false; +}; diff --git a/zen/cmds/hash.cpp b/zen/cmds/hash.cpp new file mode 100644 index 000000000..b270bec30 --- /dev/null +++ b/zen/cmds/hash.cpp @@ -0,0 +1,127 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "hash.h" + +#include <zencore/blake3.h> +#include <zencore/string.h> +#include <zencore/timer.h> + +#include <ppl.h> +#include <spdlog/spdlog.h> + +HashCommand::HashCommand() +{ + m_Options.add_options()("d,dir", "Directory to scan", cxxopts::value<std::string>(m_ScanDirectory))( + "o,output", + "Output file", + cxxopts::value<std::string>(m_OutputFile)); +} + +HashCommand::~HashCommand() = default; + +int +HashCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) +{ + ZEN_UNUSED(GlobalOptions); + + auto result = m_Options.parse(argc, argv); + + bool valid = m_ScanDirectory.length(); + + if (!valid) + throw cxxopts::OptionParseException("Chunk command requires a directory to scan"); + + // Gather list of files to process + + spdlog::info("Gathering files from {}", m_ScanDirectory); + + struct FileEntry + { + std::filesystem::path FilePath; + zen::BLAKE3 FileHash; + }; + + std::vector<FileEntry> FileList; + uint64_t FileBytes = 0; + + std::filesystem::path ScanDirectoryPath{m_ScanDirectory}; + + for (const std::filesystem::directory_entry& Entry : std::filesystem::recursive_directory_iterator(ScanDirectoryPath)) + { + if (Entry.is_regular_file()) + { + FileList.push_back({Entry.path()}); + FileBytes += Entry.file_size(); + } + } + + spdlog::info("Gathered {} files, total size {}", FileList.size(), zen::NiceBytes(FileBytes)); + + Concurrency::combinable<uint64_t> TotalBytes; + + auto hashFile = [&](FileEntry& File) { + InternalFile InputFile; + InputFile.OpenRead(File.FilePath); + const uint8_t* DataPointer = (const uint8_t*)InputFile.MemoryMapFile(); + const size_t DataSize = InputFile.GetFileSize(); + + File.FileHash = zen::BLAKE3::HashMemory(DataPointer, DataSize); + + TotalBytes.local() += DataSize; + }; + + // Process them as quickly as possible + + zen::Stopwatch Timer; + +#if 1 + Concurrency::parallel_for_each(begin(FileList), end(FileList), [&](auto& file) { hashFile(file); }); +#else + for (const auto& file : 
FileList) + { + hashFile(file); + } +#endif + + size_t TotalByteCount = 0; + + TotalBytes.combine_each([&](size_t Total) { TotalByteCount += Total; }); + + const uint64_t ElapsedMs = Timer.getElapsedTimeMs(); + spdlog::info("Scanned {} files in {}", FileList.size(), zen::NiceTimeSpanMs(ElapsedMs)); + spdlog::info("Total bytes {} ({})", zen::NiceBytes(TotalByteCount), zen::NiceByteRate(TotalByteCount, ElapsedMs)); + + InternalFile Output; + + if (m_OutputFile.empty()) + { + // TEMPORARY -- should properly open stdout + Output.OpenWrite("CONOUT$", false); + } + else + { + Output.OpenWrite(m_OutputFile, true); + } + + zen::ExtendableStringBuilder<256> Line; + + uint64_t CurrentOffset = 0; + + for (const auto& File : FileList) + { + Line.Append(File.FilePath.generic_u8string().c_str()); + Line.Append(','); + File.FileHash.ToHexString(Line); + Line.Append('\n'); + + Output.Write(Line.Data(), Line.Size(), CurrentOffset); + CurrentOffset += Line.Size(); + + Line.Reset(); + } + + // TODO: implement snapshot enumeration and display + return 0; +} diff --git a/zen/cmds/hash.h b/zen/cmds/hash.h new file mode 100644 index 000000000..b994b497d --- /dev/null +++ b/zen/cmds/hash.h @@ -0,0 +1,25 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "../internalfile.h" +#include "../zen.h" + +#include <ppl.h> + +/** Generate hash list file + */ +class HashCommand : public ZenCmdBase +{ +public: + HashCommand(); + ~HashCommand(); + + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) override; + virtual cxxopts::Options* Options() override { return &m_Options; } + +private: + cxxopts::Options m_Options{"hash", "Hash files"}; + std::string m_ScanDirectory; + std::string m_OutputFile; +}; diff --git a/zen/cmds/run.cpp b/zen/cmds/run.cpp new file mode 100644 index 000000000..b9bb8e9a3 --- /dev/null +++ b/zen/cmds/run.cpp @@ -0,0 +1,185 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS // for <cstdbool> include warning triggered by cpr + +#include "run.h" + +#include <zencore/compactbinarybuilder.h> +#include <zencore/except.h> +#include <zencore/filesystem.h> +#include <zencore/fmtutils.h> +#include <zencore/iohash.h> +#include <zencore/string.h> +#include <zencore/timer.h> +#include <zenserverprocess.h> + +#include <spdlog/spdlog.h> +#include <filesystem> + +// cpr //////////////////////////////////////////////////////////////////// +// +// For some reason, these don't seem to stick, so we disable the warnings +//# define _SILENCE_CXX17_C_HEADER_DEPRECATION_WARNING 1 +//# define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS 1 +#pragma warning(push) +#pragma warning(disable : 4004) +#pragma warning(disable : 4996) +#include <cpr/cpr.h> +#pragma warning(pop) + +#if ZEN_PLATFORM_WINDOWS +# pragma comment(lib, "Crypt32.lib") +# pragma comment(lib, "Wldap32.lib") +# pragma comment(lib, "Ws2_32.lib") +#endif + +////////////////////////////////////////////////////////////////////////// + +using namespace std::literals; + +RunCommand::RunCommand() +{ + m_Options.add_options()("h,host", "Host to run on", cxxopts::value<std::string>(m_TargetHost))("d,dir", + "Tree to run", + cxxopts::value<std::string>(m_ExeTree)); +} + +RunCommand::~RunCommand() = default; + +void +CreateTreeManifest(std::filesystem::path RootPath) +{ +} + +int +RunCommand::Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) +{ + if (GlobalOptions.PassthroughV.empty()) + { + throw cxxopts::OptionParseException("run command requires a command to run!"); + } + + ZenTestEnvironment TestEnv; + std::filesystem::path ProgramBaseDir = std::filesystem::path(argv[0]).parent_path(); + std::filesystem::path TestBaseDir = ProgramBaseDir.parent_path().parent_path() / ".test"; + TestEnv.Initialize(ProgramBaseDir, TestBaseDir); + + std::filesystem::path TestDir = TestEnv.CreateNewTestDir(); + + ZenServerInstance Zen1(TestEnv); + 
Zen1.SetTestDir(TestDir); + Zen1.SpawnServer(13337); + + auto result = m_Options.parse(argc, argv); + + std::filesystem::path TreePath{m_ExeTree}; + + struct Visitor : public zen::FileSystemTraversal::TreeVisitor + { + const std::filesystem::path& m_RootPath; + + Visitor(const std::filesystem::path& RootPath) : m_RootPath(RootPath) {} + + virtual void VisitFile(const std::filesystem::path& Parent, const std::wstring_view& FileName, uint64_t FileSize) override + { + std::filesystem::path FullPath = Parent / FileName; + + zen::IoHashStream Ios; + zen::ScanFile(FullPath, 64 * 1024, [&](const void* Data, size_t Size) { Ios.Append(Data, Size); }); + zen::IoHash Hash = Ios.GetHash(); + + std::wstring RelativePath = FullPath.lexically_relative(m_RootPath).native(); + // spdlog::info("File: {:32} => {} ({})", zen::WideToUtf8(RelativePath), Hash, FileSize); + + FileEntry& Entry = m_Files[RelativePath]; + Entry.Hash = Hash; + Entry.Size = FileSize; + + m_HashToFile[Hash] = FullPath; + } + + virtual bool VisitDirectory(const std::filesystem::path& Parent, const std::wstring_view& DirectoryName) override + { + std::filesystem::path FullPath = Parent / DirectoryName; + + if (DirectoryName.starts_with(L".")) + { + return false; + } + + return true; + } + + struct FileEntry + { + uint64_t Size; + zen::IoHash Hash; + }; + + std::map<std::wstring, FileEntry> m_Files; + std::unordered_map<zen::IoHash, std::filesystem::path, zen::IoHash::Hasher> m_HashToFile; + }; + + zen::FileSystemTraversal Traversal; + Visitor Visit(TreePath); + Traversal.TraverseFileSystem(TreePath, Visit); + + zen::CbObjectWriter PrepReq; + PrepReq << "cmd" << GlobalOptions.PassthroughV[0]; + PrepReq << "args" << GlobalOptions.PassthroughArgs; + PrepReq.BeginArray("files"); + + for (const auto& Kv : Visit.m_Files) + { + PrepReq.BeginObject(); + PrepReq << "file" << zen::WideToUtf8(Kv.first) << "size" << Kv.second.Size << "hash" << Kv.second.Hash; + PrepReq.EndObject(); + } + PrepReq.EndArray(); + + 
zen::MemoryOutStream MemOut; + zen::BinaryWriter MemWriter(MemOut); + PrepReq.Save(MemWriter); + + Zen1.WaitUntilReady(); + + cpr::Response Response = + cpr::Post(cpr::Url("http://localhost:13337/exec/jobs/prep"), cpr::Body((const char*)MemOut.Data(), MemOut.Size())); + + if (Response.status_code < 300) + { + zen::IoBuffer Payload(zen::IoBuffer::Clone, Response.text.data(), Response.text.size()); + zen::CbObject Result = zen::LoadCompactBinaryObject(Payload); + + for (auto& Need : Result["need"]) + { + zen::IoHash NeedHash = Need.AsHash(); + + if (auto It = Visit.m_HashToFile.find(NeedHash); It != Visit.m_HashToFile.end()) + { + zen::IoBuffer FileData = zen::IoBufferBuilder::MakeFromFile(It->second.c_str()); + + cpr::Response CasResponse = + cpr::Post(cpr::Url("http://localhost:13337/cas"), cpr::Body((const char*)FileData.Data(), FileData.Size())); + + if (CasResponse.status_code >= 300) + { + spdlog::error("CAS put failed with {}", CasResponse.status_code); + } + } + else + { + spdlog::error("unknown hash in 'need' list: {}", NeedHash); + } + } + } + + cpr::Response JobResponse = + cpr::Post(cpr::Url("http://localhost:13337/exec/jobs"), cpr::Body((const char*)MemOut.Data(), MemOut.Size())); + + spdlog::info("job exec: {}", JobResponse.status_code); + + return 0; +} diff --git a/zen/cmds/run.h b/zen/cmds/run.h new file mode 100644 index 000000000..8fa1f6ae9 --- /dev/null +++ b/zen/cmds/run.h @@ -0,0 +1,25 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "../internalfile.h" +#include "../zen.h" + +#include <ppl.h> + +/** Execute a command (using Zen) + */ +class RunCommand : public ZenCmdBase +{ +public: + RunCommand(); + ~RunCommand(); + + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) override; + virtual cxxopts::Options* Options() override { return &m_Options; } + +private: + cxxopts::Options m_Options{"run", "Run command"}; + std::string m_TargetHost; + std::string m_ExeTree; +}; diff --git a/zen/internalfile.cpp b/zen/internalfile.cpp new file mode 100644 index 000000000..44c60511e --- /dev/null +++ b/zen/internalfile.cpp @@ -0,0 +1,222 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "internalfile.h" + +#include <zencore/windows.h> + +#include <gsl/gsl-lite.hpp> + +#define ZEN_USE_SLIST ZEN_PLATFORM_WINDOWS + +#if ZEN_USE_SLIST == 0 +struct FileBufferManager::Impl +{ + zen::RwLock m_Lock; + std::list<zen::IoBuffer> m_FreeBuffers; + + uint64_t m_BufferSize; + uint64_t m_MaxBufferCount; + + Impl(uint64_t BufferSize, uint64_t MaxBuffers) : m_BufferSize(BufferSize), m_MaxBufferCount(MaxBuffers) {} + + zen::IoBuffer AllocBuffer() + { + zen::RwLock::ExclusiveLockScope _(m_Lock); + + if (m_FreeBuffers.empty()) + { + return zen::IoBuffer{m_BufferSize, 64 * 1024}; + } + else + { + zen::IoBuffer Buffer = std::move(m_FreeBuffers.front()); + m_FreeBuffers.pop_front(); + return Buffer; + } + } + + void ReturnBuffer(zen::IoBuffer Buffer) + { + zen::RwLock::ExclusiveLockScope _(m_Lock); + + m_FreeBuffers.push_front(std::move(Buffer)); + } +}; +#else +struct FileBufferManager::Impl +{ + struct BufferItem + { + SLIST_ENTRY ItemEntry; + zen::IoBuffer Buffer; + }; + + SLIST_HEADER m_FreeList; + uint64_t m_BufferSize; + uint64_t m_MaxBufferCount; + + Impl(uint64_t BufferSize, uint64_t MaxBuffers) : m_BufferSize(BufferSize), m_MaxBufferCount(MaxBuffers) + { + InitializeSListHead(&m_FreeList); + } + + ~Impl() + { + while (SLIST_ENTRY* Entry = 
InterlockedPopEntrySList(&m_FreeList)) + { + BufferItem* Item = reinterpret_cast<BufferItem*>(Entry); + delete Item; + } + } + + zen::IoBuffer AllocBuffer() + { + SLIST_ENTRY* Entry = InterlockedPopEntrySList(&m_FreeList); + + if (Entry == nullptr) + { + return zen::IoBuffer{m_BufferSize, 64 * 1024}; + } + else + { + BufferItem* Item = reinterpret_cast<BufferItem*>(Entry); + zen::IoBuffer Buffer = std::move(Item->Buffer); + delete Item; // Todo: could keep this around in another list + + return Buffer; + } + } + + void ReturnBuffer(zen::IoBuffer Buffer) + { + BufferItem* Item = new BufferItem{nullptr, std::move(Buffer)}; + + InterlockedPushEntrySList(&m_FreeList, &Item->ItemEntry); + } +}; +#endif + +FileBufferManager::FileBufferManager(uint64_t BufferSize, uint64_t MaxBuffers) +{ + m_Impl = new Impl{BufferSize, MaxBuffers}; +} + +FileBufferManager::~FileBufferManager() +{ + delete m_Impl; +} + +zen::IoBuffer +FileBufferManager::AllocBuffer() +{ + return m_Impl->AllocBuffer(); +} + +void +FileBufferManager::ReturnBuffer(zen::IoBuffer Buffer) +{ + return m_Impl->ReturnBuffer(Buffer); +} + +////////////////////////////////////////////////////////////////////////// + +InternalFile::InternalFile() +{ +} + +InternalFile::~InternalFile() +{ + if (m_memory) + _aligned_free(m_memory); +} + +size_t +InternalFile::GetFileSize() +{ + ULONGLONG sz; + m_file.GetSize(sz); + + return size_t(sz); +} + +void +InternalFile::OpenWrite(std::filesystem::path FileName, bool IsCreate) +{ + const DWORD dwCreationDisposition = IsCreate ? 
CREATE_ALWAYS : OPEN_EXISTING; + + HRESULT hRes = m_file.Create(FileName.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, dwCreationDisposition); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to open file"); + } +} + +void +InternalFile::OpenRead(std::filesystem::path FileName) +{ + const DWORD dwCreationDisposition = OPEN_EXISTING; + + HRESULT hRes = m_file.Create(FileName.c_str(), GENERIC_READ, FILE_SHARE_READ, dwCreationDisposition); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to open file"); + } +} + +const void* +InternalFile::MemoryMapFile() +{ + auto fileSize = GetFileSize(); + + if (fileSize > 100 * 1024 * 1024) + { + m_mmap.MapFile(m_file); + + return m_mmap.GetData(); + } + + m_memory = _aligned_malloc(fileSize, 64); + Read(m_memory, fileSize, 0); + + return m_memory; +} + +void +InternalFile::Read(void* Data, uint64_t Size, uint64_t Offset) +{ + OVERLAPPED ovl{}; + + ovl.Offset = DWORD(Offset & 0xffff'ffffu); + ovl.OffsetHigh = DWORD(Offset >> 32); + + HRESULT hRes = m_file.Read(Data, gsl::narrow<DWORD>(Size), &ovl); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to read from file" /* TODO: add context */); + } +} + +void +InternalFile::Write(const void* Data, uint64_t Size, uint64_t Offset) +{ + OVERLAPPED ovl{}; + + ovl.Offset = DWORD(Offset & 0xffff'ffffu); + ovl.OffsetHigh = DWORD(Offset >> 32); + + HRESULT hRes = m_file.Write(Data, gsl::narrow<DWORD>(Size), &ovl); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to write to file" /* TODO: add context */); + } +} + +void +InternalFile::Flush() +{ + m_file.Flush(); +} diff --git a/zen/internalfile.h b/zen/internalfile.h new file mode 100644 index 000000000..2d1f9b00f --- /dev/null +++ b/zen/internalfile.h @@ -0,0 +1,58 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#pragma warning(push) +#pragma warning(disable : 4267) // warning C4267: '=': conversion from 'size_t' to 'US', possible loss of data +#include <cxxopts.hpp> +#pragma warning(pop) + +#include <zencore/iobuffer.h> +#include <zencore/refcount.h> +#include <zencore/thread.h> +#include <zencore/windows.h> + +#include <atlfile.h> +#include <filesystem> +#include <list> + +////////////////////////////////////////////////////////////////////////// + +class FileBufferManager : public zen::RefCounted +{ +public: + FileBufferManager(uint64_t BufferSize, uint64_t MaxBufferCount); + ~FileBufferManager(); + + zen::IoBuffer AllocBuffer(); + void ReturnBuffer(zen::IoBuffer Buffer); + +private: + struct Impl; + + Impl* m_Impl; +}; + +class InternalFile : public zen::RefCounted +{ +public: + InternalFile(); + ~InternalFile(); + + void OpenRead(std::filesystem::path FileName); + void Read(void* Data, uint64_t Size, uint64_t Offset); + + void OpenWrite(std::filesystem::path FileName, bool isCreate); + void Write(const void* Data, uint64_t Size, uint64_t Offset); + + void Flush(); + void* Handle() { return m_file; } + + const void* MemoryMapFile(); + size_t GetFileSize(); + +private: + CAtlFile m_file; + CAtlFileMappingBase m_mmap; + void* m_memory = nullptr; +}; diff --git a/zen/zen.cpp b/zen/zen.cpp new file mode 100644 index 000000000..f61973e1b --- /dev/null +++ b/zen/zen.cpp @@ -0,0 +1,300 @@ +// Zen command line client utility +// + +#define DOCTEST_CONFIG_IMPLEMENT +#include <doctest/doctest.h> +#undef DOCTEST_CONFIG_IMPLEMENT + +#include "zen.h" + +#include "chunk/chunk.h" +#include "cmds/copy.h" +#include "cmds/dedup.h" +#include "cmds/deploy.h" +#include "cmds/hash.h" +#include "cmds/run.h" + +#include <zencore/scopeguard.h> +#include <zencore/string.h> +#include <zenstore/cas.h> + +#if TEST_UWS +# pragma warning(push) +# pragma warning(disable : 4458) +# pragma warning(disable : 4324) +# pragma warning(disable : 4100) +# pragma warning(disable : 4706) +# 
include <uwebsockets/App.h> +# pragma warning(pop) + +# pragma comment(lib, "Iphlpapi.lib") +# pragma comment(lib, "userenv.lib") +#endif + +#include <spdlog/spdlog.h> +#include <gsl/gsl-lite.hpp> + +#include <mimalloc-new-delete.h> + +////////////////////////////////////////////////////////////////////////// + +class TemplateCommand : public ZenCmdBase +{ +public: + TemplateCommand() { m_Options.add_options()("r,root", "Root directory for CAS pool", cxxopts::value<std::string>(m_RootDirectory)); } + + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) override { ZEN_UNUSED(GlobalOptions, argc, argv); } + + virtual cxxopts::Options* Options() override { return &m_Options; } + +private: + cxxopts::Options m_Options{"template", "EDIT THIS COMMAND DESCRIPTION"}; + std::string m_RootDirectory; +}; + +////////////////////////////////////////////////////////////////////////// + +class RunTestsCommand : public ZenCmdBase +{ +public: + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) override + { + ZEN_UNUSED(GlobalOptions); + + // Set output mode to handle virtual terminal sequences + HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE); + if (hOut == INVALID_HANDLE_VALUE) + return GetLastError(); + + DWORD dwMode = 0; + if (!GetConsoleMode(hOut, &dwMode)) + return GetLastError(); + + dwMode |= ENABLE_VIRTUAL_TERMINAL_PROCESSING; + if (!SetConsoleMode(hOut, dwMode)) + return GetLastError(); + + return doctest::Context(argc, argv).run(); + } + + virtual cxxopts::Options* Options() override { return &m_Options; } + +private: + cxxopts::Options m_Options{"runtests", "Run tests"}; +}; + +////////////////////////////////////////////////////////////////////////// +// TODO: should make this Unicode-aware so we can pass anything in on the +// command line. 
+ +int +main(int argc, char** argv) +{ + mi_version(); + +#if TEST_UWS + /* Overly simple hello world app, using multiple threads */ + std::vector<std::thread*> threads(4); + + std::transform(threads.begin(), threads.end(), threads.begin(), [](std::thread* /*t*/) { + return new std::thread([]() { + uWS::App() + .get("/*", + [&](uWS::HttpResponse<false>* res, uWS::HttpRequest*) { + zen::Sleep(1); + res->end("hello, world!"); + }) + .listen(1337, [&](auto* listen_socket) { ZEN_UNUSED(listen_socket); }) + .run(); + }); + }); + + std::for_each(threads.begin(), threads.end(), [](std::thread* t) { t->join(); }); +#endif + ////////////////////////////////////////////////////////////////////////// + + auto _ = zen::MakeGuard([] { spdlog::shutdown(); }); + + HashCommand HashCmd; + CopyCommand CopyCmd; + DedupCommand DedupCmd; + DeployCommand DeployCmd; + ChunkCommand ChunkCmd; + RunTestsCommand RunTestsCmd; + RunCommand RunCmd; + + const struct CommandInfo + { + const char* CmdName; + ZenCmdBase* Cmd; + } Commands[] = { + {"chunk", &ChunkCmd}, + {"copy", &CopyCmd}, + {"deploy", &DeployCmd}, + {"dedup", &DedupCmd}, + {"hash", &HashCmd}, + {"runtests", &RunTestsCmd}, + {"run", &RunCmd}, + }; + + // Build set containing available commands + + std::unordered_set<std::string> CommandSet; + + for (const auto& Cmd : Commands) + CommandSet.insert(Cmd.CmdName); + + // Split command line into options, commands and any pass-through arguments + + std::string Passthrough; + std::vector<std::string> PassthroughV; + + for (int i = 1; i < argc; ++i) + { + if (strcmp(argv[i], "--") == 0) + { + bool IsFirst = true; + zen::ExtendableStringBuilder<256> Line; + + for (int j = i + 1; j < argc; ++j) + { + if (!IsFirst) + { + Line.AppendAscii(" "); + } + + std::string_view ThisArg(argv[j]); + PassthroughV.push_back(std::string(ThisArg)); + + const bool NeedsQuotes = (ThisArg.find(' ') != std::string_view::npos); + + if (NeedsQuotes) + { + Line.AppendAscii("\""); + } + + Line.Append(ThisArg); + + 
if (NeedsQuotes) + { + Line.AppendAscii("\""); + } + + IsFirst = false; + } + + Passthrough = Line.c_str(); + + // This will "truncate" the arg vector and terminate the loop + argc = i - 1; + } + } + + // Split command line into global vs command options. We do this by simply + // scanning argv for a string we recognise as a command and split it there + + std::vector<char*> CommandArgVec; + CommandArgVec.push_back(argv[0]); + + for (int i = 1; i < argc; ++i) + { + if (CommandSet.find(argv[i]) != CommandSet.end()) + { + int commandArgCount = /* exec name */ 1 + argc - (i + 1); + CommandArgVec.resize(commandArgCount); + std::copy(argv + i + 1, argv + argc, CommandArgVec.begin() + 1); + + argc = i + 1; + + break; + } + } + + // Parse global CLI arguments + + ZenCliOptions GlobalOptions; + + GlobalOptions.PassthroughArgs = Passthrough; + GlobalOptions.PassthroughV = PassthroughV; + + std::string SubCommand = "<None>"; + + cxxopts::Options Options("zen", "Zen management tool"); + Options.add_options()("d, debug", "Enable debugging", cxxopts::value<bool>(GlobalOptions.IsDebug)); + Options.add_options()("v, verbose", "Enable verbose logging", cxxopts::value<bool>(GlobalOptions.IsVerbose)); + Options.add_options()("help", "Show command line help"); + Options.add_options()("c, command", "Sub command", cxxopts::value<std::string>(SubCommand)); + + Options.parse_positional({"command"}); + + const bool IsNullInvoke = (argc == 1); // If no arguments are passed we want to print usage information + + try + { + auto ParseResult = Options.parse(argc, argv); + + if (ParseResult.count("help") || IsNullInvoke == 1) + { + std::string Help = Options.help(); + + printf("%s\n", Help.c_str()); + + printf("available commands:\n"); + + for (const auto& CmdInfo : Commands) + { + printf("\n-- %s\n%s\n", CmdInfo.CmdName, CmdInfo.Cmd->Options()->help().c_str()); + } + + exit(0); + } + + for (const CommandInfo& CmdInfo : Commands) + { + if (_stricmp(SubCommand.c_str(), CmdInfo.CmdName) == 0) + { 
+ cxxopts::Options* VerbOptions = CmdInfo.Cmd->Options(); + + try + { + return CmdInfo.Cmd->Run(GlobalOptions, (int)CommandArgVec.size(), CommandArgVec.data()); + } + catch (cxxopts::OptionParseException& Ex) + { + if (VerbOptions) + { + std::string help = VerbOptions->help(); + + printf("Error parsing arguments for command '%s': %s\n\n%s", SubCommand.c_str(), Ex.what(), help.c_str()); + + exit(11); + } + else + { + printf("Error parsing arguments for command '%s': %s\n\n", SubCommand.c_str(), Ex.what()); + + exit(11); + } + } + } + } + + printf("Unknown command specified: '%s', exiting\n", SubCommand.c_str()); + } + catch (cxxopts::OptionParseException& Ex) + { + std::string HelpMessage = Options.help(); + + printf("Error parsing snapshot program arguments: %s\n\n%s", Ex.what(), HelpMessage.c_str()); + + return 9; + } + catch (std::exception& Ex) + { + printf("Exception caught from 'main': %s\n", Ex.what()); + + return 10; + } + + return 0; +} diff --git a/zen/zen.h b/zen/zen.h new file mode 100644 index 000000000..c90f3169a --- /dev/null +++ b/zen/zen.h @@ -0,0 +1,31 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#pragma warning(push) +#pragma warning(disable : 4267) // warning C4267: '=': conversion from 'size_t' to 'US', possible loss of data +#include <cxxopts.hpp> +#pragma warning(pop) + +#include <zencore/refcount.h> +#include <zencore/windows.h> + +#include <atlfile.h> +#include <filesystem> + +struct ZenCliOptions +{ + bool IsDebug = false; + bool IsVerbose = false; + + // Arguments after " -- " on command line are passed through and not parsed + std::string PassthroughArgs; + std::vector<std::string> PassthroughV; +}; + +class ZenCmdBase +{ +public: + virtual int Run(const ZenCliOptions& GlobalOptions, int argc, char** argv) = 0; + virtual cxxopts::Options* Options() = 0; +}; diff --git a/zen/zen.vcxproj b/zen/zen.vcxproj new file mode 100644 index 000000000..2614405d7 --- /dev/null +++ b/zen/zen.vcxproj @@ -0,0 +1,128 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>16.0</VCProjectVersion> + <Keyword>Win32Proj</Keyword> + <ProjectGuid>{ca7b9e04-a2d3-4a39-a7d7-fb156a2c6a48}</ProjectGuid> + <RootNamespace>zen</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + 
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_debug.props" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_release.props" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <LinkIncremental>true</LinkIncremental> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <LinkIncremental>false</LinkIncremental> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> 
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>..\zencore\include;..\zenstore\include;..\zentestutil\include</AdditionalIncludeDirectories> + <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>..\zencore\include;..\zenstore\include;..\zentestutil\include</AdditionalIncludeDirectories> + <RuntimeLibrary>MultiThreaded</RuntimeLibrary> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="chunk\chunk.cpp" /> + <ClCompile Include="cmds\copy.cpp" /> + <ClCompile Include="cmds\dedup.cpp" /> + <ClCompile Include="cmds\deploy.cpp" /> + <ClCompile Include="cmds\hash.cpp" /> + <ClCompile Include="cmds\run.cpp" /> + <ClCompile Include="internalfile.cpp" /> + <ClCompile Include="zen.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="chunk\chunk.h" /> + <ClInclude Include="cmds\copy.h" /> + <ClInclude Include="cmds\dedup.h" /> + <ClInclude Include="cmds\deploy.h" /> + <ClInclude Include="cmds\hash.h" /> + <ClInclude Include="cmds\run.h" /> + <ClInclude 
Include="internalfile.h" /> + <ClInclude Include="zen.h" /> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\zencore\zencore.vcxproj"> + <Project>{d75bf9ab-c61e-4fff-ad59-1563430f05e2}</Project> + </ProjectReference> + <ProjectReference Include="..\zenstore\zenstore.vcxproj"> + <Project>{26cbbaeb-14c1-4efc-877d-80f48215651c}</Project> + </ProjectReference> + <ProjectReference Include="..\zentestutil\zentestutil.vcxproj"> + <Project>{77f8315d-b21d-4db0-9a6f-2d3359f88a70}</Project> + </ProjectReference> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/zen/zen.vcxproj.filters b/zen/zen.vcxproj.filters new file mode 100644 index 000000000..562f245d5 --- /dev/null +++ b/zen/zen.vcxproj.filters @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClCompile Include="chunk\chunk.cpp" /> + <ClCompile Include="zen.cpp" /> + <ClCompile Include="internalfile.cpp" /> + <ClCompile Include="cmds\hash.cpp"> + <Filter>cmds</Filter> + </ClCompile> + <ClCompile Include="cmds\dedup.cpp"> + <Filter>cmds</Filter> + </ClCompile> + <ClCompile Include="cmds\deploy.cpp"> + <Filter>cmds</Filter> + </ClCompile> + <ClCompile Include="cmds\copy.cpp"> + <Filter>cmds</Filter> + </ClCompile> + <ClCompile Include="cmds\run.cpp"> + <Filter>cmds</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="chunk\chunk.h" /> + <ClInclude Include="zen.h" /> + <ClInclude Include="internalfile.h" /> + <ClInclude Include="cmds\hash.h"> + <Filter>cmds</Filter> + </ClInclude> + <ClInclude Include="cmds\deploy.h"> + <Filter>cmds</Filter> + </ClInclude> + <ClInclude Include="cmds\dedup.h"> + <Filter>cmds</Filter> + </ClInclude> + <ClInclude Include="cmds\copy.h"> + <Filter>cmds</Filter> + </ClInclude> + <ClInclude Include="cmds\run.h"> + <Filter>cmds</Filter> + </ClInclude> + </ItemGroup> + <ItemGroup> + <Filter Include="cmds"> + <UniqueIdentifier>{2e06a54c-52be-4260-9275-a4232d01a53c}</UniqueIdentifier> + </Filter> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/zen_base_debug.props b/zen_base_debug.props new file mode 100644 index 000000000..c147de362 --- /dev/null +++ b/zen_base_debug.props @@ -0,0 +1,14 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ImportGroup Label="PropertySheets" /> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup /> + <ItemDefinitionGroup> + <ClCompile> + <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> + <BasicRuntimeChecks>StackFrameRuntimeCheck</BasicRuntimeChecks> + <SupportJustMyCode>false</SupportJustMyCode> + </ClCompile> + </ItemDefinitionGroup> + <ItemGroup /> +</Project>
\ No newline at end of file diff --git a/zen_base_release.props b/zen_base_release.props new file mode 100644 index 000000000..886ce6890 --- /dev/null +++ b/zen_base_release.props @@ -0,0 +1,12 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ImportGroup Label="PropertySheets" /> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup /> + <ItemDefinitionGroup> + <ClCompile> + <RuntimeLibrary>MultiThreaded</RuntimeLibrary> + </ClCompile> + </ItemDefinitionGroup> + <ItemGroup /> +</Project>
\ No newline at end of file diff --git a/zencore-test/targetver.h b/zencore-test/targetver.h new file mode 100644 index 000000000..d432d6993 --- /dev/null +++ b/zencore-test/targetver.h @@ -0,0 +1,10 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +// Including SDKDDKVer.h defines the highest available Windows platform. + +// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and +// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. + +#include <SDKDDKVer.h> diff --git a/zencore-test/zencore-test.cpp b/zencore-test/zencore-test.cpp new file mode 100644 index 000000000..8c53cf349 --- /dev/null +++ b/zencore-test/zencore-test.cpp @@ -0,0 +1,21 @@ +// zencore-test.cpp : Defines the entry point for the console application. +// + +#include <zencore/sha1.h> +#include <zencore/zencore.h> + +#define DOCTEST_CONFIG_IMPLEMENT +#include <doctest/doctest.h> +#undef DOCTEST_CONFIG_IMPLEMENT + +void +forceLinkTests() +{ + zencore_forcelinktests(); +} + +int +main(int argc, char* argv[]) +{ + return doctest::Context(argc, argv).run(); +} diff --git a/zencore-test/zencore-test.vcxproj b/zencore-test/zencore-test.vcxproj new file mode 100644 index 000000000..77a4397fe --- /dev/null +++ b/zencore-test/zencore-test.vcxproj @@ -0,0 +1,118 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>15.0</VCProjectVersion> + <ProjectGuid>{C00173DF-B76E-4989-B576-FE2B780B2580}</ProjectGuid> + 
<Keyword>Win32Proj</Keyword> + <RootNamespace>zencoretest</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>false</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <LinkIncremental>true</LinkIncremental> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + 
<LinkIncremental>false</LinkIncremental> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <PrecompiledHeader>NotUsing</PrecompiledHeader> + <Optimization>Disabled</Optimization> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>../doctest/doctest;../zencore/include</AdditionalIncludeDirectories> + <LanguageStandard>stdcpplatest</LanguageStandard> + <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies> + <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <ProjectReference /> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <PrecompiledHeader>NotUsing</PrecompiledHeader> + <Optimization>MaxSpeed</Optimization> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + 
<AdditionalIncludeDirectories>../doctest/doctest;../zencore/include</AdditionalIncludeDirectories> + <WholeProgramOptimization>false</WholeProgramOptimization> + <LanguageStandard>stdcpplatest</LanguageStandard> + <RuntimeLibrary>MultiThreaded</RuntimeLibrary> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + <ShowProgress>NotSet</ShowProgress> + </Link> + <ProjectReference /> + </ItemDefinitionGroup> + <ItemGroup> + <ClInclude Include="targetver.h" /> + </ItemGroup> + <ItemGroup> + <ClCompile Include="zencore-test.cpp" /> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\zencore\zencore.vcxproj"> + <Project>{d75bf9ab-c61e-4fff-ad59-1563430f05e2}</Project> + </ProjectReference> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/zencore-test/zencore-test.vcxproj.filters b/zencore-test/zencore-test.vcxproj.filters new file mode 100644 index 000000000..fc33e90b9 --- /dev/null +++ b/zencore-test/zencore-test.vcxproj.filters @@ -0,0 +1,9 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClInclude Include="targetver.h" /> + </ItemGroup> + <ItemGroup> + <ClCompile Include="zencore-test.cpp" /> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/zencore/blake3.cpp b/zencore/blake3.cpp new file mode 100644 index 000000000..ec5d496d5 --- /dev/null +++ b/zencore/blake3.cpp @@ -0,0 +1,153 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zencore/blake3.h> + +#include <zencore/string.h> +#include <zencore/zencore.h> +#include "../3rdparty/BLAKE3/c/blake3.h" + +#pragma comment(lib, "blake3.lib") + +#include <doctest/doctest.h> +#include <string.h> + +////////////////////////////////////////////////////////////////////////// + +namespace zen { + +void +blake3_forcelink() +{ +} + +BLAKE3 BLAKE3::Zero; // Initialized to all zeroes + +BLAKE3 +BLAKE3::HashMemory(const void* data, size_t byteCount) +{ + BLAKE3 b3; + + blake3_hasher b3h; + blake3_hasher_init(&b3h); + blake3_hasher_update(&b3h, data, byteCount); + blake3_hasher_finalize(&b3h, b3.Hash, sizeof b3.Hash); + + return b3; +} + +BLAKE3 +BLAKE3::FromHexString(const char* string) +{ + BLAKE3 b3; + + ParseHexBytes(string, 2 * sizeof b3.Hash, b3.Hash); + + return b3; +} + +const char* +BLAKE3::ToHexString(char* outString /* 64 characters + NUL terminator */) const +{ + ToHexBytes(Hash, sizeof(BLAKE3), outString); // writes two hex digits per hash byte + outString[2 * sizeof(BLAKE3)] = '\0'; // terminate right after the last hex digit + + return outString; +} + +StringBuilderBase& +BLAKE3::ToHexString(StringBuilderBase& outBuilder) const +{ + char str[2 * sizeof(BLAKE3) + 1]; // hex digits + NUL terminator + ToHexString(str); + + outBuilder.AppendRange(str, &str[2 * sizeof(BLAKE3)]); // range ends before the NUL so it is not appended + + return outBuilder; +} + +BLAKE3Stream::BLAKE3Stream() +{ + blake3_hasher* b3h = reinterpret_cast<blake3_hasher*>(m_HashState); + static_assert(sizeof(blake3_hasher) <= sizeof(m_HashState)); + blake3_hasher_init(b3h); +} + +void +BLAKE3Stream::Reset() +{ + blake3_hasher* b3h = reinterpret_cast<blake3_hasher*>(m_HashState); + blake3_hasher_init(b3h); +} + +BLAKE3Stream& +BLAKE3Stream::Append(const void* data, size_t byteCount) +{ + blake3_hasher* b3h = reinterpret_cast<blake3_hasher*>(m_HashState); + blake3_hasher_update(b3h, data, byteCount); + + return *this; +} +
+BLAKE3 +BLAKE3Stream::GetHash() +{ + BLAKE3 b3; + + blake3_hasher* b3h = reinterpret_cast<blake3_hasher*>(m_HashState); + blake3_hasher_finalize(b3h, b3.Hash, sizeof b3.Hash); + + return b3; +} + +////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... +// + +doctest::String +toString(const BLAKE3& value) +{ + char text[2 * sizeof(BLAKE3) + 1]; + value.ToHexString(text); + + return text; +} + +TEST_CASE("BLAKE3") +{ + SUBCASE("Basics") + { + BLAKE3 b3 = BLAKE3::HashMemory(nullptr, 0); + CHECK(BLAKE3::FromHexString("af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262") == b3); + + BLAKE3::String_t b3s; + std::string b3ss = b3.ToHexString(b3s); + CHECK(b3ss == "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"); + } + + SUBCASE("hashes") + { + CHECK(BLAKE3::FromHexString("00307ced6a8b278d5e3a9f77b138d0e9d2209717c9d45b205f427a73565cc5fb") == BLAKE3::HashMemory("abc123", 6)); + CHECK(BLAKE3::FromHexString("a7142c8c3905cd11b1e35105c7ac588b75d6798822f71e1145187ad46f3e8df4") == + BLAKE3::HashMemory("1234567890123456789012345678901234567890", 40)); + CHECK(BLAKE3::FromHexString("70e708532559265c4662d0285e5e0a4be8bd972bd1f255a93ddf342243adc427") == + BLAKE3::HashMemory("The HttpSendHttpResponse function sends an HTTP response to the specified HTTP request.", 87)); + } + + SUBCASE("streamHashes") + { + auto streamHash = [](const void* data, size_t dataBytes) -> BLAKE3 { + BLAKE3Stream b3s; + b3s.Append(data, dataBytes); + return b3s.GetHash(); + }; + + CHECK(BLAKE3::FromHexString("00307ced6a8b278d5e3a9f77b138d0e9d2209717c9d45b205f427a73565cc5fb") == streamHash("abc123", 6)); + CHECK(BLAKE3::FromHexString("a7142c8c3905cd11b1e35105c7ac588b75d6798822f71e1145187ad46f3e8df4") == + streamHash("1234567890123456789012345678901234567890", 40)); + CHECK(BLAKE3::FromHexString("70e708532559265c4662d0285e5e0a4be8bd972bd1f255a93ddf342243adc427") == + streamHash("The HttpSendHttpResponse function sends 
an HTTP response to the specified HTTP request.", 87)); + } +} + +} // namespace zen diff --git a/zencore/compactbinary.cpp b/zencore/compactbinary.cpp new file mode 100644 index 000000000..4ee9e9281 --- /dev/null +++ b/zencore/compactbinary.cpp @@ -0,0 +1,1279 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "zencore/compactbinary.h" + +#include <zencore/endian.h> +#include <zencore/stream.h> +#include <zencore/trace.h> + +#include <doctest/doctest.h> +#include <ryml/ryml.hpp> +#include <string_view> + +namespace zen { + +const int DaysPerMonth[] = {0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; +const int DaysToMonth[] = {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}; + +bool +IsLeapYear(int Year) +{ + if ((Year % 4) == 0) + { + return (((Year % 100) != 0) || ((Year % 400) == 0)); + } + + return false; +} + +void +DateTime::Set(int Year, int Month, int Day, int Hour, int Minute, int Second, int MilliSecond) +{ + int TotalDays = 0; + + if ((Month > 2) && IsLeapYear(Year)) + { + ++TotalDays; + } + + --Year; // the current year is not a full year yet + --Month; // the current month is not a full month yet + + TotalDays += Year * 365; + TotalDays += Year / 4; // leap year day every four years... + TotalDays -= Year / 100; // ...except every 100 years... 
+ TotalDays += Year / 400; // ...but also every 400 years + TotalDays += DaysToMonth[Month]; // days in this year up to last month + TotalDays += Day - 1; // days in this month minus today + + Ticks = TotalDays * TimeSpan::TicksPerDay + Hour * TimeSpan::TicksPerHour + Minute * TimeSpan::TicksPerMinute + + Second * TimeSpan::TicksPerSecond + MilliSecond * TimeSpan::TicksPerMillisecond; +} + +void +TimeSpan::Set(int Days, int Hours, int Minutes, int Seconds, int FractionNano) +{ + int64_t TotalTicks = 0; + + TotalTicks += Days * TicksPerDay; + TotalTicks += Hours * TicksPerHour; + TotalTicks += Minutes * TicksPerMinute; + TotalTicks += Seconds * TicksPerSecond; + TotalTicks += FractionNano / NanosecondsPerTick; + + Ticks = TotalTicks; +} + +////////////////////////////////////////////////////////////////////////// + +namespace usonprivate { + + static constexpr const uint8_t GEmptyObjectPayload[] = {uint8_t(CbFieldType::Object), 0x00}; + static constexpr const uint8_t GEmptyArrayPayload[] = {uint8_t(CbFieldType::Array), 0x01, 0x00}; + + template<typename T> + static constexpr inline T ReadUnaligned(const void* const Memory) + { +#if PLATFORM_SUPPORTS_UNALIGNED_LOADS + return *static_cast<const T*>(Memory); +#else + T Value; + memcpy(&Value, Memory, sizeof(Value)); + return Value; +#endif + } +} // namespace usonprivate + +////////////////////////////////////////////////////////////////////////// + +CbFieldView::CbFieldView(const void* DataPointer, CbFieldType FieldType) +{ + const uint8_t* Bytes = static_cast<const uint8_t*>(DataPointer); + const CbFieldType LocalType = CbFieldTypeOps::HasFieldType(FieldType) ? (CbFieldType(*Bytes++) | CbFieldType::HasFieldType) : FieldType; + + uint32_t NameLenByteCount = 0; + const uint64_t NameLen64 = CbFieldTypeOps::HasFieldName(LocalType) ? 
ReadVarUInt(Bytes, NameLenByteCount) : 0; + Bytes += NameLen64 + NameLenByteCount; + + Type = LocalType; + NameLen = uint32_t(std::clamp<uint64_t>(NameLen64, 0, ~uint32_t(0))); + Payload = Bytes; +} + +void +CbFieldView::IterateAttachments(std::function<void(CbFieldView)> Visitor) const +{ + switch (CbFieldTypeOps::GetType(Type)) + { + case CbFieldType::Object: + case CbFieldType::UniformObject: + return CbObjectView::FromFieldView(*this).IterateAttachments(Visitor); + case CbFieldType::Array: + case CbFieldType::UniformArray: + return CbArrayView::FromFieldView(*this).IterateAttachments(Visitor); + case CbFieldType::CompactBinaryAttachment: + case CbFieldType::BinaryAttachment: + return Visitor(*this); + default: + return; + } +} + +CbObjectView +CbFieldView::AsObjectView() +{ + if (CbFieldTypeOps::IsObject(Type)) + { + Error = CbFieldError::None; + return CbObjectView::FromFieldView(*this); + } + else + { + Error = CbFieldError::TypeError; + return CbObjectView(); + } +} + +CbArrayView +CbFieldView::AsArrayView() +{ + if (CbFieldTypeOps::IsArray(Type)) + { + Error = CbFieldError::None; + return CbArrayView::FromFieldView(*this); + } + else + { + Error = CbFieldError::TypeError; + return CbArrayView(); + } +} + +MemoryView +CbFieldView::AsBinaryView(const MemoryView Default) +{ + if (CbFieldTypeOps::IsBinary(Type)) + { + const uint8_t* const PayloadBytes = static_cast<const uint8_t*>(Payload); + uint32_t ValueSizeByteCount; + const uint64_t ValueSize = ReadVarUInt(PayloadBytes, ValueSizeByteCount); + + Error = CbFieldError::None; + return MemoryView(PayloadBytes + ValueSizeByteCount, ValueSize); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +std::string_view +CbFieldView::AsString(const std::string_view Default) +{ + if (CbFieldTypeOps::IsString(Type)) + { + const char* const PayloadChars = static_cast<const char*>(Payload); + uint32_t ValueSizeByteCount; + const uint64_t ValueSize = ReadVarUInt(PayloadChars, ValueSizeByteCount); + + 
if (ValueSize >= (uint64_t(1) << 31)) + { + Error = CbFieldError::RangeError; + return Default; + } + + Error = CbFieldError::None; + return std::string_view(PayloadChars + ValueSizeByteCount, ValueSize); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +uint64_t +CbFieldView::AsInteger(const uint64_t Default, const IntegerParams Params) +{ + if (CbFieldTypeOps::IsInteger(Type)) + { + // A shift of a 64-bit value by 64 is undefined so shift by one less because magnitude is never zero. + const uint64_t OutOfRangeMask = uint64_t(-2) << (Params.MagnitudeBits - 1); + const uint64_t IsNegative = uint8_t(Type) & 1; + + uint32_t MagnitudeByteCount; + const uint64_t Magnitude = ReadVarUInt(Payload, MagnitudeByteCount); + const uint64_t Value = Magnitude ^ -int64_t(IsNegative); + + const uint64_t IsInRange = (!(Magnitude & OutOfRangeMask)) & ((!IsNegative) | Params.IsSigned); + Error = IsInRange ? CbFieldError::None : CbFieldError::RangeError; + + const uint64_t UseValueMask = -int64_t(IsInRange); + return (Value & UseValueMask) | (Default & ~UseValueMask); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +float +CbFieldView::AsFloat(const float Default) +{ + switch (CbFieldTypeOps::GetType(Type)) + { + case CbFieldType::IntegerPositive: + case CbFieldType::IntegerNegative: + { + const uint64_t IsNegative = uint8_t(Type) & 1; + constexpr uint64_t OutOfRangeMask = ~((uint64_t(1) << /*FLT_MANT_DIG*/ 24) - 1); + + uint32_t MagnitudeByteCount; + const int64_t Magnitude = ReadVarUInt(Payload, MagnitudeByteCount) + IsNegative; + const uint64_t IsInRange = !(Magnitude & OutOfRangeMask); + Error = IsInRange ? CbFieldError::None : CbFieldError::RangeError; + return IsInRange ? float(IsNegative ? 
-Magnitude : Magnitude) : Default; + } + case CbFieldType::Float32: + { + Error = CbFieldError::None; + const uint32_t Value = FromNetworkOrder(usonprivate::ReadUnaligned<uint32_t>(Payload)); + return reinterpret_cast<const float&>(Value); + } + case CbFieldType::Float64: + Error = CbFieldError::RangeError; + return Default; + default: + Error = CbFieldError::TypeError; + return Default; + } +} + +double +CbFieldView::AsDouble(const double Default) +{ + switch (CbFieldTypeOps::GetType(Type)) + { + case CbFieldType::IntegerPositive: + case CbFieldType::IntegerNegative: + { + const uint64_t IsNegative = uint8_t(Type) & 1; + constexpr uint64_t OutOfRangeMask = ~((uint64_t(1) << /*DBL_MANT_DIG*/ 53) - 1); + + uint32_t MagnitudeByteCount; + const int64_t Magnitude = ReadVarUInt(Payload, MagnitudeByteCount) + IsNegative; + const uint64_t IsInRange = !(Magnitude & OutOfRangeMask); + Error = IsInRange ? CbFieldError::None : CbFieldError::RangeError; + return IsInRange ? double(IsNegative ? -Magnitude : Magnitude) : Default; + } + case CbFieldType::Float32: + { + Error = CbFieldError::None; + const uint32_t Value = FromNetworkOrder(usonprivate::ReadUnaligned<uint32_t>(Payload)); + return reinterpret_cast<const float&>(Value); + } + case CbFieldType::Float64: + { + Error = CbFieldError::None; + const uint64_t Value = FromNetworkOrder(usonprivate::ReadUnaligned<uint64_t>(Payload)); + return reinterpret_cast<const double&>(Value); + } + default: + Error = CbFieldError::TypeError; + return Default; + } +} + +bool +CbFieldView::AsBool(const bool bDefault) +{ + const CbFieldType LocalType = Type; + const bool bIsBool = CbFieldTypeOps::IsBool(LocalType); + Error = bIsBool ? 
CbFieldError::None : CbFieldError::TypeError; + return (uint8_t(bIsBool) & uint8_t(LocalType) & 1) | ((!bIsBool) & bDefault); +} + +IoHash +CbFieldView::AsCompactBinaryAttachment(const IoHash& Default) +{ + if (CbFieldTypeOps::IsCompactBinaryAttachment(Type)) + { + Error = CbFieldError::None; + return IoHash::MakeFrom(Payload); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +IoHash +CbFieldView::AsBinaryAttachment(const IoHash& Default) +{ + if (CbFieldTypeOps::IsBinaryAttachment(Type)) + { + Error = CbFieldError::None; + return IoHash::MakeFrom(Payload); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +IoHash +CbFieldView::AsAttachment(const IoHash& Default) +{ + if (CbFieldTypeOps::IsAttachment(Type)) + { + Error = CbFieldError::None; + return IoHash::MakeFrom(Payload); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +IoHash +CbFieldView::AsHash(const IoHash& Default) +{ + if (CbFieldTypeOps::IsHash(Type)) + { + Error = CbFieldError::None; + return IoHash::MakeFrom(Payload); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +Guid +CbFieldView::AsUuid() +{ + return AsUuid(Guid()); +} + +Guid +CbFieldView::AsUuid(const Guid& Default) +{ + if (CbFieldTypeOps::IsUuid(Type)) + { + Error = CbFieldError::None; + Guid Value; + memcpy(&Value, Payload, sizeof(Guid)); + Value.A = FromNetworkOrder(Value.A); + Value.B = FromNetworkOrder(Value.B); + Value.C = FromNetworkOrder(Value.C); + Value.D = FromNetworkOrder(Value.D); + return Value; + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +Oid +CbFieldView::AsObjectId() +{ + return AsObjectId(Oid()); +} + +Oid +CbFieldView::AsObjectId(const Oid& Default) +{ + if (CbFieldTypeOps::IsObjectId(Type)) + { + Error = CbFieldError::None; + Oid Value; + memcpy(&Value, Payload, sizeof(Oid)); + return Value; + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +int64_t 
+CbFieldView::AsDateTimeTicks(const int64_t Default) +{ + if (CbFieldTypeOps::IsDateTime(Type)) + { + Error = CbFieldError::None; + return FromNetworkOrder(usonprivate::ReadUnaligned<int64_t>(Payload)); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +DateTime +CbFieldView::AsDateTime() +{ + return DateTime(AsDateTimeTicks(0)); +} + +DateTime +CbFieldView::AsDateTime(DateTime Default) +{ + return DateTime(AsDateTimeTicks(Default.GetTicks())); +} + +int64_t +CbFieldView::AsTimeSpanTicks(const int64_t Default) +{ + if (CbFieldTypeOps::IsTimeSpan(Type)) + { + Error = CbFieldError::None; + return FromNetworkOrder(usonprivate::ReadUnaligned<int64_t>(Payload)); + } + else + { + Error = CbFieldError::TypeError; + return Default; + } +} + +TimeSpan +CbFieldView::AsTimeSpan() +{ + return TimeSpan(AsTimeSpanTicks(0)); +} + +TimeSpan +CbFieldView::AsTimeSpan(TimeSpan Default) +{ + return TimeSpan(AsTimeSpanTicks(Default.GetTicks())); +} + +uint64_t +CbFieldView::GetSize() const +{ + return sizeof(CbFieldType) + GetViewNoType().GetSize(); +} + +uint64_t +CbFieldView::GetPayloadSize() const +{ + switch (CbFieldTypeOps::GetType(Type)) + { + case CbFieldType::None: + case CbFieldType::Null: + return 0; + case CbFieldType::Object: + case CbFieldType::UniformObject: + case CbFieldType::Array: + case CbFieldType::UniformArray: + case CbFieldType::Binary: + case CbFieldType::String: + { + uint32_t PayloadSizeByteCount; + const uint64_t PayloadSize = ReadVarUInt(Payload, PayloadSizeByteCount); + return PayloadSize + PayloadSizeByteCount; + } + case CbFieldType::IntegerPositive: + case CbFieldType::IntegerNegative: + return MeasureVarUInt(Payload); + case CbFieldType::Float32: + return 4; + case CbFieldType::Float64: + return 8; + case CbFieldType::BoolFalse: + case CbFieldType::BoolTrue: + return 0; + case CbFieldType::CompactBinaryAttachment: + case CbFieldType::BinaryAttachment: + case CbFieldType::Hash: + return 20; + case CbFieldType::Uuid: + return 
16; + case CbFieldType::ObjectId: + return 12; + case CbFieldType::DateTime: + case CbFieldType::TimeSpan: + return 8; + default: + return 0; + } +} + +IoHash +CbFieldView::GetHash() const +{ + IoHashStream HashStream; + GetHash(HashStream); + return HashStream.GetHash(); +} + +void +CbFieldView::GetHash(IoHashStream& Hash) const +{ + const CbFieldType SerializedType = CbFieldTypeOps::GetSerializedType(Type); + Hash.Append(&SerializedType, sizeof(SerializedType)); + auto View = GetViewNoType(); + Hash.Append(View.GetData(), View.GetSize()); +} + +bool +CbFieldView::Equals(const CbFieldView& Other) const +{ + return CbFieldTypeOps::GetSerializedType(Type) == CbFieldTypeOps::GetSerializedType(Other.Type) && + GetViewNoType().EqualBytes(Other.GetViewNoType()); +} + +void +CbFieldView::CopyTo(MutableMemoryView Buffer) const +{ + const MemoryView Source = GetViewNoType(); + ZEN_ASSERT(Buffer.GetSize() == sizeof(CbFieldType) + Source.GetSize()); + // TEXT("A buffer of %" UINT64_FMT " bytes was provided when %" UINT64_FMT " bytes are required"), + // Buffer.GetSize(), + // sizeof(CbFieldType) + Source.GetSize()); + *static_cast<CbFieldType*>(Buffer.GetData()) = CbFieldTypeOps::GetSerializedType(Type); + Buffer.RightChopInline(sizeof(CbFieldType)); + memcpy(Buffer.GetData(), Source.GetData(), Source.GetSize()); +} + +void +CbFieldView::CopyTo(BinaryWriter& Ar) const +{ + const MemoryView Source = GetViewNoType(); + CbFieldType SerializedType = CbFieldTypeOps::GetSerializedType(Type); + Ar.Write(&SerializedType, sizeof(SerializedType)); + Ar.Write(Source.GetData(), Source.GetSize()); +} + +MemoryView +CbFieldView::GetView() const +{ + const uint32_t TypeSize = CbFieldTypeOps::HasFieldType(Type) ? sizeof(CbFieldType) : 0; + const uint32_t NameSize = CbFieldTypeOps::HasFieldName(Type) ? 
NameLen + MeasureVarUInt(NameLen) : 0; + const uint64_t PayloadSize = GetPayloadSize(); + return MemoryView(static_cast<const uint8_t*>(Payload) - TypeSize - NameSize, TypeSize + NameSize + PayloadSize); +} + +MemoryView +CbFieldView::GetViewNoType() const +{ + const uint32_t NameSize = CbFieldTypeOps::HasFieldName(Type) ? NameLen + MeasureVarUInt(NameLen) : 0; + const uint64_t PayloadSize = GetPayloadSize(); + return MemoryView(static_cast<const uint8_t*>(Payload) - NameSize, NameSize + PayloadSize); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CbArrayView::CbArrayView() : CbFieldView(usonprivate::GEmptyArrayPayload) +{ +} + +uint64_t +CbArrayView::Num() const +{ + const uint8_t* PayloadBytes = static_cast<const uint8_t*>(GetPayload()); + PayloadBytes += MeasureVarUInt(PayloadBytes); + uint32_t NumByteCount; + return ReadVarUInt(PayloadBytes, NumByteCount); +} + +CbFieldViewIterator +CbArrayView::CreateViewIterator() const +{ + const uint8_t* PayloadBytes = static_cast<const uint8_t*>(GetPayload()); + uint32_t PayloadSizeByteCount; + const uint64_t PayloadSize = ReadVarUInt(PayloadBytes, PayloadSizeByteCount); + PayloadBytes += PayloadSizeByteCount; + const uint64_t NumByteCount = MeasureVarUInt(PayloadBytes); + if (PayloadSize > NumByteCount) + { + const void* const PayloadEnd = PayloadBytes + PayloadSize; + PayloadBytes += NumByteCount; + const CbFieldType UniformType = + CbFieldTypeOps::GetType(GetType()) == CbFieldType::UniformArray ? 
CbFieldType(*PayloadBytes++) : CbFieldType::HasFieldType; + return CbFieldViewIterator::MakeRange(MemoryView(PayloadBytes, PayloadEnd), UniformType); + } + return CbFieldViewIterator(); +} + +void +CbArrayView::VisitFields(ICbVisitor&) +{ +} + +uint64_t +CbArrayView::GetSize() const +{ + return sizeof(CbFieldType) + GetPayloadSize(); +} + +IoHash +CbArrayView::GetHash() const +{ + IoHashStream Hash; + GetHash(Hash); + return Hash.GetHash(); +} + +void +CbArrayView::GetHash(IoHashStream& HashStream) const +{ + const CbFieldType SerializedType = CbFieldTypeOps::GetType(GetType()); + HashStream.Append(&SerializedType, sizeof(SerializedType)); + auto _ = GetPayloadView(); + HashStream.Append(_.GetData(), _.GetSize()); +} + +bool +CbArrayView::Equals(const CbArrayView& Other) const +{ + return CbFieldTypeOps::GetType(GetType()) == CbFieldTypeOps::GetType(Other.GetType()) && + GetPayloadView().EqualBytes(Other.GetPayloadView()); +} + +void +CbArrayView::CopyTo(MutableMemoryView Buffer) const +{ + const MemoryView Source = GetPayloadView(); + ZEN_ASSERT(Buffer.GetSize() == sizeof(CbFieldType) + Source.GetSize()); + // TEXT("Buffer is %" UINT64_FMT " bytes but %" UINT64_FMT " is required."), + // Buffer.GetSize(), + // sizeof(CbFieldType) + Source.GetSize()); + + *static_cast<CbFieldType*>(Buffer.GetData()) = CbFieldTypeOps::GetType(GetType()); + Buffer.RightChopInline(sizeof(CbFieldType)); + memcpy(Buffer.GetData(), Source.GetData(), Source.GetSize()); +} + +void +CbArrayView::CopyTo(BinaryWriter& Ar) const +{ + const MemoryView Source = GetPayloadView(); + CbFieldType SerializedType = CbFieldTypeOps::GetType(GetType()); + Ar.Write(&SerializedType, sizeof(SerializedType)); + Ar.Write(Source.GetData(), Source.GetSize()); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CbObjectView::CbObjectView() : CbFieldView(usonprivate::GEmptyObjectPayload) +{ +} + +CbFieldViewIterator 
+CbObjectView::CreateViewIterator() const +{ + const uint8_t* PayloadBytes = static_cast<const uint8_t*>(GetPayload()); + uint32_t PayloadSizeByteCount; + const uint64_t PayloadSize = ReadVarUInt(PayloadBytes, PayloadSizeByteCount); + + PayloadBytes += PayloadSizeByteCount; + + if (PayloadSize) + { + const void* const PayloadEnd = PayloadBytes + PayloadSize; + const CbFieldType UniformType = + CbFieldTypeOps::GetType(GetType()) == CbFieldType::UniformObject ? CbFieldType(*PayloadBytes++) : CbFieldType::HasFieldType; + return CbFieldViewIterator::MakeRange(MemoryView(PayloadBytes, PayloadEnd), UniformType); + } + + return CbFieldViewIterator(); +} + +void +CbObjectView::VisitFields(ICbVisitor&) +{ +} + +CbFieldView +CbObjectView::FindView(const std::string_view Name) const +{ + for (const CbFieldView Field : *this) + { + if (Name == Field.GetName()) + { + return Field; + } + } + return CbFieldView(); +} + +CbFieldView +CbObjectView::FindViewIgnoreCase(const std::string_view Name) const +{ + for (const CbFieldView Field : *this) + { + if (Name == Field.GetName()) + { + return Field; + } + } + return CbFieldView(); +} + +uint64_t +CbObjectView::GetSize() const +{ + return sizeof(CbFieldType) + GetPayloadSize(); +} + +IoHash +CbObjectView::GetHash() const +{ + IoHashStream Hash; + GetHash(Hash); + return Hash.GetHash(); +} + +void +CbObjectView::GetHash(IoHashStream& HashStream) const +{ + const CbFieldType SerializedType = CbFieldTypeOps::GetType(GetType()); + HashStream.Append(&SerializedType, sizeof(SerializedType)); + HashStream.Append(GetPayloadView()); +} + +bool +CbObjectView::Equals(const CbObjectView& Other) const +{ + return CbFieldTypeOps::GetType(GetType()) == CbFieldTypeOps::GetType(Other.GetType()) && + GetPayloadView().EqualBytes(Other.GetPayloadView()); +} + +void +CbObjectView::CopyTo(MutableMemoryView Buffer) const +{ + const MemoryView Source = GetPayloadView(); + ZEN_ASSERT(Buffer.GetSize() == (sizeof(CbFieldType) + Source.GetSize())); + // 
TEXT("Buffer is %" UINT64_FMT " bytes but %" UINT64_FMT " is required."), + // Buffer.GetSize(), + // sizeof(CbFieldType) + Source.GetSize()); + *static_cast<CbFieldType*>(Buffer.GetData()) = CbFieldTypeOps::GetType(GetType()); + Buffer.RightChopInline(sizeof(CbFieldType)); + memcpy(Buffer.GetData(), Source.GetData(), Source.GetSize()); +} + +void +CbObjectView::CopyTo(BinaryWriter& Ar) const +{ + const MemoryView Source = GetPayloadView(); + CbFieldType SerializedType = CbFieldTypeOps::GetType(GetType()); + Ar.Write(&SerializedType, sizeof(SerializedType)); + Ar.Write(Source.GetData(), Source.GetSize()); +} + +////////////////////////////////////////////////////////////////////////// + +template<typename FieldType> +uint64_t +TCbFieldIterator<FieldType>::GetRangeSize() const +{ + MemoryView View; + if (TryGetSerializedRangeView(View)) + { + return View.GetSize(); + } + else + { + uint64_t Size = 0; + for (CbFieldViewIterator It(*this); It; ++It) + { + Size += It.GetSize(); + } + return Size; + } +} + +template<typename FieldType> +IoHash +TCbFieldIterator<FieldType>::GetRangeHash() const +{ + IoHashStream Hash; + GetRangeHash(Hash); + return IoHash(Hash.GetHash()); +} + +template<typename FieldType> +void +TCbFieldIterator<FieldType>::GetRangeHash(IoHashStream& Hash) const +{ + MemoryView View; + if (TryGetSerializedRangeView(View)) + { + Hash.Append(View.GetData(), View.GetSize()); + } + else + { + for (CbFieldViewIterator It(*this); It; ++It) + { + It.GetHash(Hash); + } + } +} + +template<typename FieldType> +void +TCbFieldIterator<FieldType>::CopyRangeTo(MutableMemoryView InBuffer) const +{ + MemoryView Source; + if (TryGetSerializedRangeView(Source)) + { + ZEN_ASSERT(InBuffer.GetSize() == Source.GetSize()); + // TEXT("Buffer is %" UINT64_FMT " bytes but %" UINT64_FMT " is required."), + // InBuffer.GetSize(), + // Source.GetSize()); + memcpy(InBuffer.GetData(), Source.GetData(), Source.GetSize()); + } + else + { + for (CbFieldViewIterator It(*this); It; ++It) 
+ { + const uint64_t Size = It.GetSize(); + It.CopyTo(InBuffer.Left(Size)); + InBuffer.RightChopInline(Size); + } + } +} + +template class TCbFieldIterator<CbFieldView>; +template class TCbFieldIterator<CbField>; + +template<typename FieldType> +void +TCbFieldIterator<FieldType>::IterateRangeAttachments(std::function<void(CbFieldView)> Visitor) const +{ + if (CbFieldTypeOps::HasFieldType(FieldType::GetType())) + { + // Always iterate over non-uniform ranges because we do not know if they contain an attachment. + for (CbFieldViewIterator It(*this); It; ++It) + { + if (CbFieldTypeOps::MayContainAttachments(It.GetType())) + { + It.IterateAttachments(Visitor); + } + } + } + else + { + // Only iterate over uniform ranges if the uniform type may contain an attachment. + if (CbFieldTypeOps::MayContainAttachments(FieldType::GetType())) + { + for (CbFieldViewIterator It(*this); It; ++It) + { + It.IterateAttachments(Visitor); + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CbFieldIterator +CbFieldIterator::CloneRange(const CbFieldViewIterator& It) +{ + MemoryView View; + if (It.TryGetSerializedRangeView(View)) + { + return MakeRange(SharedBuffer::Clone(View)); + } + else + { + UniqueBuffer Buffer = UniqueBuffer::Alloc(It.GetRangeSize()); + It.CopyRangeTo(MutableMemoryView(Buffer.GetData(), Buffer.GetSize())); + return MakeRange(SharedBuffer(std::move(Buffer))); + } +} + +SharedBuffer +CbFieldIterator::GetRangeBuffer() const +{ + const MemoryView RangeView = GetRangeView(); + const SharedBuffer& OuterBuffer = GetOuterBuffer(); + return OuterBuffer.GetView() == RangeView ? 
OuterBuffer : SharedBuffer::MakeView(RangeView, OuterBuffer); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +uint64_t +MeasureCompactBinary(MemoryView View, CbFieldType Type) +{ + uint64_t Size; + return TryMeasureCompactBinary(View, Type, Size, Type) ? Size : 0; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +bool +TryMeasureCompactBinary(MemoryView View, CbFieldType& OutType, uint64_t& OutSize, CbFieldType Type) +{ + uint64_t Size = 0; + + if (CbFieldTypeOps::HasFieldType(Type)) + { + if (View.GetSize() == 0) + { + OutType = CbFieldType::None; + OutSize = 1; + return false; + } + + Type = *static_cast<const CbFieldType*>(View.GetData()); + View.RightChopInline(1); + Size += 1; + } + + bool bDynamicSize = false; + uint64_t FixedSize = 0; + switch (CbFieldTypeOps::GetType(Type)) + { + case CbFieldType::Null: + break; + case CbFieldType::Object: + case CbFieldType::UniformObject: + case CbFieldType::Array: + case CbFieldType::UniformArray: + case CbFieldType::Binary: + case CbFieldType::String: + case CbFieldType::IntegerPositive: + case CbFieldType::IntegerNegative: + bDynamicSize = true; + break; + case CbFieldType::Float32: + FixedSize = 4; + break; + case CbFieldType::Float64: + FixedSize = 8; + break; + case CbFieldType::BoolFalse: + case CbFieldType::BoolTrue: + break; + case CbFieldType::CompactBinaryAttachment: + case CbFieldType::BinaryAttachment: + case CbFieldType::Hash: + FixedSize = 20; + break; + case CbFieldType::Uuid: + FixedSize = 16; + break; + case CbFieldType::ObjectId: + FixedSize = 12; + break; + case CbFieldType::DateTime: + case CbFieldType::TimeSpan: + FixedSize = 8; + break; + case CbFieldType::None: + default: + OutType = CbFieldType::None; + OutSize = 0; + return false; + } + + OutType = Type; + + if (CbFieldTypeOps::HasFieldName(Type)) + { + if (View.GetSize() == 0) + { + 
OutSize = Size + 1; + return false; + } + + uint32_t NameLenByteCount = MeasureVarUInt(View.GetData()); + if (View.GetSize() < NameLenByteCount) + { + OutSize = Size + NameLenByteCount; + return false; + } + + const uint64_t NameLen = ReadVarUInt(View.GetData(), NameLenByteCount); + const uint64_t NameSize = NameLen + NameLenByteCount; + + if (bDynamicSize && View.GetSize() < NameSize) + { + OutSize = Size + NameSize; + return false; + } + + View.RightChopInline(NameSize); + Size += NameSize; + } + + switch (CbFieldTypeOps::GetType(Type)) + { + case CbFieldType::Object: + case CbFieldType::UniformObject: + case CbFieldType::Array: + case CbFieldType::UniformArray: + case CbFieldType::Binary: + case CbFieldType::String: + if (View.GetSize() == 0) + { + OutSize = Size + 1; + return false; + } + else + { + uint32_t PayloadSizeByteCount = MeasureVarUInt(View.GetData()); + if (View.GetSize() < PayloadSizeByteCount) + { + OutSize = Size + PayloadSizeByteCount; + return false; + } + const uint64_t PayloadSize = ReadVarUInt(View.GetData(), PayloadSizeByteCount); + OutSize = Size + PayloadSize + PayloadSizeByteCount; + } + return true; + + case CbFieldType::IntegerPositive: + case CbFieldType::IntegerNegative: + if (View.GetSize() == 0) + { + OutSize = Size + 1; + return false; + } + OutSize = Size + MeasureVarUInt(View.GetData()); + return true; + + default: + OutSize = Size + FixedSize; + return true; + } +} + +////////////////////////////////////////////////////////////////////////// + +CbField +LoadCompactBinary(BinaryReader& Ar, BufferAllocator Allocator) +{ + std::vector<uint8_t> HeaderBytes; + CbFieldType FieldType; + uint64_t FieldSize = 1; + + // Read in small increments until the total field size is known, to avoid reading too far. 
+ for (;;) + { + const int32_t ReadSize = int32_t(FieldSize - HeaderBytes.size()); + const size_t ReadOffset = HeaderBytes.size(); + HeaderBytes.resize(ReadOffset + ReadSize); + + Ar.Read(HeaderBytes.data() + ReadOffset, ReadSize); + if (TryMeasureCompactBinary(MakeMemoryView(HeaderBytes), FieldType, FieldSize)) + { + break; + } + ZEN_ASSERT(FieldSize > 0, "Failed to load from invalid compact binary data."); + } + + // Allocate the buffer, copy the header, and read the remainder of the field. + UniqueBuffer Buffer = Allocator(FieldSize); + ZEN_ASSERT(Buffer.GetSize() == FieldSize); + MutableMemoryView View = Buffer.GetView(); + memcpy(View.GetData(), HeaderBytes.data(), HeaderBytes.size()); + View.RightChopInline(HeaderBytes.size()); + if (!View.IsEmpty()) + { + Ar.Read(View.GetData(), View.GetSize()); + } + return CbField(SharedBuffer(std::move(Buffer))); +} + +////////////////////////////////////////////////////////////////////////// + +void +SaveCompactBinary(BinaryWriter& Ar, const CbFieldView& Field) +{ + Field.CopyTo(Ar); +} + +void +SaveCompactBinary(BinaryWriter& Ar, const CbArrayView& Array) +{ + Array.CopyTo(Ar); +} + +void +SaveCompactBinary(BinaryWriter& Ar, const CbObjectView& Object) +{ + Object.CopyTo(Ar); +} + +////////////////////////////////////////////////////////////////////////// + +StringBuilderBase& +ToString(CbObjectView& Root, StringBuilderBase& OutString) +{ + ryml::Tree Tree; + + ryml::NodeRef r = Tree.rootref(); + r |= ryml::MAP; + + for (CbFieldViewIterator It = Root.CreateViewIterator(); It; ++It) + { + } + + return OutString; +} + +////////////////////////////////////////////////////////////////////////// + +void +uson_forcelink() +{ +} + +TEST_CASE("uson") +{ + using namespace std::literals; + + SUBCASE("CbField") + { + constexpr CbFieldView DefaultField; + static_assert(!DefaultField.HasName(), "Error in HasName()"); + static_assert(!DefaultField.HasValue(), "Error in HasValue()"); + static_assert(!DefaultField.HasError(), "Error in 
HasError()"); + static_assert(DefaultField.GetError() == CbFieldError::None, "Error in GetError()"); + + CHECK(DefaultField.GetSize() == 1); + CHECK(DefaultField.GetName().size() == 0); + CHECK(DefaultField.HasName() == false); + CHECK(DefaultField.HasValue() == false); + CHECK(DefaultField.HasError() == false); + CHECK(DefaultField.GetError() == CbFieldError::None); + + const uint8_t Type = (uint8_t)CbFieldType::None; + CHECK(DefaultField.GetHash() == IoHash::HashMemory(&Type, sizeof Type)); + + CHECK(DefaultField.GetView() == MemoryView{}); + MemoryView SerializedView; + CHECK(DefaultField.TryGetSerializedView(SerializedView) == false); + } + + SUBCASE("CbField(None)") + { + CbFieldView NoneField(nullptr, CbFieldType::None); + CHECK(NoneField.GetSize() == 1); + CHECK(NoneField.GetName().size() == 0); + CHECK(NoneField.HasName() == false); + CHECK(NoneField.HasValue() == false); + CHECK(NoneField.HasError() == false); + CHECK(NoneField.GetError() == CbFieldError::None); + CHECK(NoneField.GetHash() == CbFieldView().GetHash()); + CHECK(NoneField.GetView() == MemoryView()); + MemoryView SerializedView; + CHECK(NoneField.TryGetSerializedView(SerializedView) == false); + } + + SUBCASE("CbField(None|Type|Name)") + { + constexpr CbFieldType FieldType = CbFieldType::None | CbFieldType::HasFieldName; + constexpr const char NoneBytes[] = {char(FieldType), 4, 'N', 'a', 'm', 'e'}; + CbFieldView NoneField(NoneBytes); + + CHECK(NoneField.GetSize() == sizeof(NoneBytes)); + CHECK(NoneField.GetName().compare("Name"sv) == 0); + CHECK(NoneField.HasName() == true); + CHECK(NoneField.HasValue() == false); + CHECK(NoneField.GetHash() == IoHash::HashMemory(NoneBytes, sizeof NoneBytes)); + CHECK(NoneField.GetView() == MemoryView(NoneBytes, sizeof NoneBytes)); + MemoryView SerializedView; + CHECK(NoneField.TryGetSerializedView(SerializedView) == true); + CHECK(SerializedView == MemoryView(NoneBytes, sizeof NoneBytes)); + + uint8_t CopyBytes[sizeof(NoneBytes)]; + 
NoneField.CopyTo(MutableMemoryView(CopyBytes, sizeof CopyBytes)); + CHECK(MemoryView(NoneBytes, sizeof NoneBytes).EqualBytes(MemoryView(CopyBytes, sizeof CopyBytes))); + } + + SUBCASE("CbField(None|Type)") + { + constexpr CbFieldType FieldType = CbFieldType::None; + constexpr const char NoneBytes[] = {char(FieldType)}; + CbFieldView NoneField(NoneBytes); + + CHECK(NoneField.GetSize() == sizeof NoneBytes); + CHECK(NoneField.GetName().size() == 0); + CHECK(NoneField.HasName() == false); + CHECK(NoneField.HasValue() == false); + CHECK(NoneField.GetHash() == CbFieldView().GetHash()); + CHECK(NoneField.GetView() == MemoryView(NoneBytes, sizeof NoneBytes)); + MemoryView SerializedView; + CHECK(NoneField.TryGetSerializedView(SerializedView) == true); + CHECK(SerializedView == MemoryView(NoneBytes, sizeof NoneBytes)); + } + + SUBCASE("CbField(None|Name)") + { + constexpr CbFieldType FieldType = CbFieldType::None | CbFieldType::HasFieldName; + constexpr const char NoneBytes[] = {char(FieldType), 4, 'N', 'a', 'm', 'e'}; + CbFieldView NoneField(NoneBytes + 1, FieldType); + CHECK(NoneField.GetSize() == uint64_t(sizeof NoneBytes)); + CHECK(NoneField.GetName().compare("Name") == 0); + CHECK(NoneField.HasName() == true); + CHECK(NoneField.HasValue() == false); + CHECK(NoneField.GetHash() == IoHash::HashMemory(NoneBytes, sizeof NoneBytes)); + CHECK(NoneField.GetView() == MemoryView(&NoneBytes[1], sizeof NoneBytes - 1)); + MemoryView SerializedView; + CHECK(NoneField.TryGetSerializedView(SerializedView) == false); + + uint8_t CopyBytes[sizeof(NoneBytes)]; + NoneField.CopyTo(MutableMemoryView(CopyBytes, sizeof CopyBytes)); + CHECK(MemoryView(NoneBytes, sizeof NoneBytes).EqualBytes(MemoryView(CopyBytes, sizeof CopyBytes))); + } + + SUBCASE("CbField(None|EmptyName)") + { + constexpr CbFieldType FieldType = CbFieldType::None | CbFieldType::HasFieldName; + constexpr const uint8_t NoneBytes[] = {uint8_t(FieldType), 0}; + CbFieldView NoneField(NoneBytes + 1, FieldType); + 
CHECK(NoneField.GetSize() == sizeof NoneBytes); + CHECK(NoneField.GetName().empty() == true); + CHECK(NoneField.HasName() == true); + CHECK(NoneField.HasValue() == false); + CHECK(NoneField.GetHash() == IoHash::HashMemory(NoneBytes, sizeof NoneBytes)); + CHECK(NoneField.GetView() == MemoryView(&NoneBytes[1], sizeof NoneBytes - 1)); + MemoryView SerializedView; + CHECK(NoneField.TryGetSerializedView(SerializedView) == false); + } + + static_assert(!std::is_constructible<CbFieldView, const CbObjectView&>::value, "Invalid constructor for CbField"); + static_assert(!std::is_assignable<CbFieldView, const CbObjectView&>::value, "Invalid assignment for CbField"); + static_assert(!std::is_convertible<CbFieldView, CbObjectView>::value, "Invalid conversion to CbObject"); + static_assert(!std::is_assignable<CbObjectView, const CbFieldView&>::value, "Invalid assignment for CbObject"); +} + +TEST_CASE("uson.null") +{ + using namespace std::literals; + + SUBCASE("CbField(Null)") + { + CbFieldView NullField(nullptr, CbFieldType::Null); + CHECK(NullField.GetSize() == 1); + CHECK(NullField.IsNull() == true); + CHECK(NullField.HasValue() == true); + CHECK(NullField.HasError() == false); + CHECK(NullField.GetError() == CbFieldError::None); + const uint8_t Null[]{uint8_t(CbFieldType::Null)}; + CHECK(NullField.GetHash() == IoHash::HashMemory(Null, sizeof Null)); + } + + SUBCASE("CbField(None)") + { + CbFieldView Field; + CHECK(Field.IsNull() == false); + } +} + +} // namespace zen diff --git a/zencore/compactbinarybuilder.cpp b/zencore/compactbinarybuilder.cpp new file mode 100644 index 000000000..d1422e5a2 --- /dev/null +++ b/zencore/compactbinarybuilder.cpp @@ -0,0 +1,1530 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "zencore/compactbinarybuilder.h" + +#include <zencore/compactbinarypackage.h> +#include <zencore/compactbinaryvalidation.h> +#include <zencore/endian.h> +#include <zencore/stream.h> +#include <zencore/string.h> + +#define _USE_MATH_DEFINES +#include <math.h> + +#include <doctest/doctest.h> + +namespace zen { + +template<typename T> +uint64_t +AddUninitialized(std::vector<T>& Vector, uint64_t Count) +{ + const uint64_t Offset = Vector.size(); + Vector.resize(Offset + Count); + return Offset; +} + +template<typename T> +uint64_t +Append(std::vector<T>& Vector, const T* Data, uint64_t Count) +{ + const uint64_t Offset = Vector.size(); + Vector.resize(Offset + Count); + + memcpy(Vector.data() + Offset, Data, sizeof(T) * Count); + + return Offset; +} + +////////////////////////////////////////////////////////////////////////// + +enum class CbWriter::StateFlags : uint8_t +{ + None = 0, + /** Whether a name has been written for the current field. */ + Name = 1 << 0, + /** Whether this state is in the process of writing a field. */ + Field = 1 << 1, + /** Whether this state is for array fields. */ + Array = 1 << 2, + /** Whether this state is for object fields. */ + Object = 1 << 3, +}; + +ENUM_CLASS_FLAGS(CbWriter::StateFlags); + +/** Whether the field type can be used in a uniform array or uniform object. */ +static constexpr bool +IsUniformType(const CbFieldType Type) +{ + if (CbFieldTypeOps::HasFieldName(Type)) + { + return true; + } + + switch (Type) + { + case CbFieldType::None: + case CbFieldType::Null: + case CbFieldType::BoolFalse: + case CbFieldType::BoolTrue: + return false; + default: + return true; + } +} + +/** Append the payload from the compact binary value to the array and return its type. 
*/ +static inline CbFieldType +AppendCompactBinary(const CbFieldView& Value, std::vector<uint8_t>& OutData) +{ + struct FCopy : public CbFieldView + { + using CbFieldView::GetPayloadView; + using CbFieldView::GetType; + }; + const FCopy& ValueCopy = static_cast<const FCopy&>(Value); + const MemoryView SourceView = ValueCopy.GetPayloadView(); + const uint64_t TargetOffset = OutData.size(); + OutData.resize(TargetOffset + SourceView.GetSize()); + memcpy(OutData.data() + TargetOffset, SourceView.GetData(), SourceView.GetSize()); + return CbFieldTypeOps::GetType(ValueCopy.GetType()); +} + +CbWriter::CbWriter() +{ + States.emplace_back(); +} + +CbWriter::CbWriter(const int64_t InitialSize) : CbWriter() +{ + Data.reserve(InitialSize); +} + +CbWriter::~CbWriter() +{ +} + +void +CbWriter::Reset() +{ + Data.resize(0); + States.resize(0); + States.emplace_back(); +} + +CbFieldIterator +CbWriter::Save() +{ + const uint64_t Size = GetSaveSize(); + UniqueBuffer Buffer = UniqueBuffer::Alloc(Size); + const CbFieldViewIterator Output = Save(MutableMemoryView(Buffer.GetData(), Buffer.GetSize())); + SharedBuffer SharedBuf(std::move(Buffer)); + + return CbFieldIterator::MakeRangeView(Output, SharedBuf); +} + +CbFieldViewIterator +CbWriter::Save(const MutableMemoryView Buffer) +{ + ZEN_ASSERT(States.size() == 1 && States.back().Flags == StateFlags::None); + // TEXT("It is invalid to save while there are incomplete write operations.")); + ZEN_ASSERT(Data.size() > 0); // TEXT("It is invalid to save when nothing has been written.")); + ZEN_ASSERT(Buffer.GetSize() == Data.size()); + // TEXT("Buffer is %" UINT64_FMT " bytes but %" INT64_FMT " is required."), + // Buffer.GetSize(), + // Data.Num()); + memcpy(Buffer.GetData(), Data.data(), Data.size()); + return CbFieldViewIterator::MakeRange(Buffer); +} + +void +CbWriter::Save(BinaryWriter& Writer) +{ + ZEN_ASSERT(States.size() == 1 && States.back().Flags == StateFlags::None); + // TEXT("It is invalid to save while there are incomplete 
write operations.")); + ZEN_ASSERT(Data.size() > 0); // TEXT("It is invalid to save when nothing has been written.")); + Writer.Write(Data.data(), Data.size()); +} + +uint64_t +CbWriter::GetSaveSize() const +{ + return Data.size(); +} + +void +CbWriter::BeginField() +{ + WriterState& State = States.back(); + if ((State.Flags & StateFlags::Field) == StateFlags::None) + { + State.Flags |= StateFlags::Field; + State.Offset = Data.size(); + Data.push_back(0); + } + else + { + ZEN_ASSERT((State.Flags & StateFlags::Name) == StateFlags::Name); + // TEXT("A new field cannot be written until the previous field '%.*hs' is finished."), + // GetActiveName().Len(), + // GetActiveName().GetData()); + } +} + +void +CbWriter::EndField(CbFieldType Type) +{ + WriterState& State = States.back(); + + if ((State.Flags & StateFlags::Name) == StateFlags::Name) + { + Type |= CbFieldType::HasFieldName; + } + else + { + ZEN_ASSERT((State.Flags & StateFlags::Object) == StateFlags::None); + // TEXT("It is invalid to write an object field without a unique non-empty name.")); + } + + if (State.Count == 0) + { + State.UniformType = Type; + } + else if (State.UniformType != Type) + { + State.UniformType = CbFieldType::None; + } + + State.Flags &= ~(StateFlags::Name | StateFlags::Field); + ++State.Count; + Data[State.Offset] = uint8_t(Type); +} + +ZEN_NOINLINE +CbWriter& +CbWriter::SetName(const std::string_view Name) +{ + WriterState& State = States.back(); + ZEN_ASSERT((State.Flags & StateFlags::Array) != StateFlags::Array); + // TEXT("It is invalid to write a name for an array field. Name '%.*hs'"), + // Name.Len(), + // Name.GetData()); + ZEN_ASSERT(!Name.empty()); + // TEXT("%s"), + //(State.Flags & EStateFlags::Object) == EStateFlags::Object + // ? TEXT("It is invalid to write an empty name for an object field. Specify a unique non-empty name.") + // : TEXT("It is invalid to write an empty name for a top-level field. 
Specify a name or avoid this call.")); + ZEN_ASSERT((State.Flags & (StateFlags::Name | StateFlags::Field)) == StateFlags::None); + // TEXT("A new field '%.*hs' cannot be written until the previous field '%.*hs' is finished."), + // Name.Len(), + // Name.GetData(), + // GetActiveName().Len(), + // GetActiveName().GetData()); + + BeginField(); + State.Flags |= StateFlags::Name; + const uint32_t NameLenByteCount = MeasureVarUInt(uint32_t(Name.size())); + const int64_t NameLenOffset = Data.size(); + Data.resize(NameLenOffset + NameLenByteCount); + + WriteVarUInt(uint64_t(Name.size()), Data.data() + NameLenOffset); + + const uint8_t* NamePtr = reinterpret_cast<const uint8_t*>(Name.data()); + Data.insert(Data.end(), NamePtr, NamePtr + Name.size()); + return *this; +} + +void +CbWriter::SetNameOrAddString(const std::string_view NameOrValue) +{ + // A name is only written if it would begin a new field inside of an object. + if ((States.back().Flags & (StateFlags::Name | StateFlags::Field | StateFlags::Object)) == StateFlags::Object) + { + SetName(NameOrValue); + } + else + { + AddString(NameOrValue); + } +} + +std::string_view +CbWriter::GetActiveName() const +{ + const WriterState& State = States.back(); + if ((State.Flags & StateFlags::Name) == StateFlags::Name) + { + const uint8_t* const EncodedName = Data.data() + State.Offset + sizeof(CbFieldType); + uint32_t NameLenByteCount; + const uint64_t NameLen = ReadVarUInt(EncodedName, NameLenByteCount); + const size_t ClampedNameLen = std::clamp<uint64_t>(NameLen, 0, ~uint64_t(0)); + return std::string_view(reinterpret_cast<const char*>(EncodedName + NameLenByteCount), ClampedNameLen); + } + return std::string_view(); +} + +void +CbWriter::MakeFieldsUniform(const int64_t FieldBeginOffset, const int64_t FieldEndOffset) +{ + MutableMemoryView SourceView(Data.data() + FieldBeginOffset, uint64_t(FieldEndOffset - FieldBeginOffset)); + MutableMemoryView TargetView = SourceView; + TargetView.RightChopInline(sizeof(CbFieldType)); + 
+ while (!SourceView.IsEmpty()) + { + const uint64_t FieldSize = MeasureCompactBinary(SourceView) - sizeof(CbFieldType); + SourceView.RightChopInline(sizeof(CbFieldType)); + if (TargetView.GetData() != SourceView.GetData()) + { + memmove(TargetView.GetData(), SourceView.GetData(), FieldSize); + } + SourceView.RightChopInline(FieldSize); + TargetView.RightChopInline(FieldSize); + } + + if (!TargetView.IsEmpty()) + { + const auto EraseBegin = Data.begin() + (FieldEndOffset - TargetView.GetSize()); + const auto EraseEnd = EraseBegin + TargetView.GetSize(); + + Data.erase(EraseBegin, EraseEnd); + } +} + +void +CbWriter::AddField(const CbFieldView& Value) +{ + ZEN_ASSERT(Value.HasValue()); // , TEXT("It is invalid to write a field with no value.")); + BeginField(); + EndField(AppendCompactBinary(Value, Data)); +} + +void +CbWriter::AddField(const CbField& Value) +{ + AddField(CbFieldView(Value)); +} + +void +CbWriter::BeginObject() +{ + BeginField(); + States.push_back(WriterState()); + States.back().Flags |= StateFlags::Object; +} + +void +CbWriter::EndObject() +{ + ZEN_ASSERT(States.size() > 1 && (States.back().Flags & StateFlags::Object) == StateFlags::Object); + + // TEXT("It is invalid to end an object when an object is not at the top of the stack.")); + ZEN_ASSERT((States.back().Flags & StateFlags::Field) == StateFlags::None); + // TEXT("It is invalid to end an object until the previous field is finished.")); + + const bool bUniform = IsUniformType(States.back().UniformType); + const uint64_t Count = States.back().Count; + States.pop_back(); + + // Calculate the offset of the payload. + const WriterState& State = States.back(); + int64_t PayloadOffset = State.Offset + 1; + if ((State.Flags & StateFlags::Name) == StateFlags::Name) + { + uint32_t NameLenByteCount; + const uint64_t NameLen = ReadVarUInt(Data.data() + PayloadOffset, NameLenByteCount); + PayloadOffset += NameLen + NameLenByteCount; + } + + // Remove redundant field types for uniform objects. 
+ if (bUniform && Count > 1) + { + MakeFieldsUniform(PayloadOffset, Data.size()); + } + + // Insert the object size. + const uint64_t Size = uint64_t(Data.size() - PayloadOffset); + const uint32_t SizeByteCount = MeasureVarUInt(Size); + Data.insert(Data.begin() + PayloadOffset, SizeByteCount, 0); + WriteVarUInt(Size, Data.data() + PayloadOffset); + + EndField(bUniform ? CbFieldType::UniformObject : CbFieldType::Object); +} + +void +CbWriter::AddObject(const CbObjectView& Value) +{ + BeginField(); + EndField(AppendCompactBinary(Value.AsFieldView(), Data)); +} + +void +CbWriter::AddObject(const CbObject& Value) +{ + AddObject(CbObjectView(Value)); +} + +ZEN_NOINLINE +void +CbWriter::BeginArray() +{ + BeginField(); + States.push_back(WriterState()); + States.back().Flags |= StateFlags::Array; +} + +void +CbWriter::EndArray() +{ + ZEN_ASSERT(States.size() > 1 && (States.back().Flags & StateFlags::Array) == StateFlags::Array); + // TEXT("Invalid attempt to end an array when an array is not at the top of the stack.")); + ZEN_ASSERT((States.back().Flags & StateFlags::Field) == StateFlags::None); + // TEXT("It is invalid to end an array until the previous field is finished.")); + const bool bUniform = IsUniformType(States.back().UniformType); + const uint64_t Count = States.back().Count; + States.pop_back(); + + // Calculate the offset of the payload. + const WriterState& State = States.back(); + int64_t PayloadOffset = State.Offset + 1; + if ((State.Flags & StateFlags::Name) == StateFlags::Name) + { + uint32_t NameLenByteCount; + const uint64_t NameLen = ReadVarUInt(Data.data() + PayloadOffset, NameLenByteCount); + PayloadOffset += NameLen + NameLenByteCount; + } + + // Remove redundant field types for uniform arrays. + if (bUniform && Count > 1) + { + MakeFieldsUniform(PayloadOffset, Data.size()); + } + + // Insert the array size and field count. 
+ const uint32_t CountByteCount = MeasureVarUInt(Count); + const uint64_t Size = uint64_t(Data.size() - PayloadOffset) + CountByteCount; + const uint32_t SizeByteCount = MeasureVarUInt(Size); + Data.insert(Data.begin() + PayloadOffset, SizeByteCount + CountByteCount, 0); + WriteVarUInt(Size, Data.data() + PayloadOffset); + WriteVarUInt(Count, Data.data() + PayloadOffset + SizeByteCount); + + EndField(bUniform ? CbFieldType::UniformArray : CbFieldType::Array); +} + +void +CbWriter::AddArray(const CbArrayView& Value) +{ + BeginField(); + EndField(AppendCompactBinary(Value.AsFieldView(), Data)); +} + +void +CbWriter::AddArray(const CbArray& Value) +{ + AddArray(CbArrayView(Value)); +} + +void +CbWriter::AddNull() +{ + BeginField(); + EndField(CbFieldType::Null); +} + +void +CbWriter::AddBinary(const void* const Value, const uint64_t Size) +{ + BeginField(); + const uint32_t SizeByteCount = MeasureVarUInt(Size); + const int64_t SizeOffset = Data.size(); + Data.resize(Data.size() + SizeByteCount); + WriteVarUInt(Size, Data.data() + SizeOffset); + Data.insert(Data.end(), static_cast<const uint8_t*>(Value), static_cast<const uint8_t*>(Value) + Size); + EndField(CbFieldType::Binary); +} + +void +CbWriter::AddBinary(IoBuffer Buffer) +{ + AddBinary(Buffer.Data(), Buffer.Size()); +} + +void +CbWriter::AddBinary(SharedBuffer Buffer) +{ + AddBinary(Buffer.GetData(), Buffer.GetSize()); +} + +void +CbWriter::AddString(const std::string_view Value) +{ + BeginField(); + const uint64_t Size = uint64_t(Value.size()); + const uint32_t SizeByteCount = MeasureVarUInt(Size); + const int64_t Offset = Data.size(); + + Data.resize(Offset + SizeByteCount + Size); + + uint8_t* StringData = Data.data() + Offset; + WriteVarUInt(Size, StringData); + StringData += SizeByteCount; + if (Size > 0) + { + memcpy(StringData, Value.data(), Value.size() * sizeof(char)); + } + EndField(CbFieldType::String); +} + +void +CbWriter::AddString(const std::wstring_view Value) +{ + BeginField(); + 
ExtendableStringBuilder<128> Utf8; + WideToUtf8(Value, Utf8); + + const uint32_t Size = uint32_t(Utf8.Size()); + const uint32_t SizeByteCount = MeasureVarUInt(Size); + const int64_t Offset = Data.size(); + Data.resize(Offset + SizeByteCount + Size); + uint8_t* StringData = Data.data() + Offset; + WriteVarUInt(Size, StringData); + StringData += SizeByteCount; + if (Size > 0) + { + memcpy(reinterpret_cast<char*>(StringData), Utf8.Data(), Utf8.Size()); + } + EndField(CbFieldType::String); +} + +ZEN_NOINLINE +void +CbWriter::AddInteger(const int32_t Value) +{ + if (Value >= 0) + { + return AddInteger(uint32_t(Value)); + } + BeginField(); + const uint32_t Magnitude = ~uint32_t(Value); + const uint32_t MagnitudeByteCount = MeasureVarUInt(Magnitude); + const int64_t Offset = Data.size(); + Data.resize(Offset + MagnitudeByteCount); + WriteVarUInt(Magnitude, Data.data() + Offset); + EndField(CbFieldType::IntegerNegative); +} + +void +CbWriter::AddInteger(const int64_t Value) +{ + if (Value >= 0) + { + return AddInteger(uint64_t(Value)); + } + BeginField(); + const uint64_t Magnitude = ~uint64_t(Value); + const uint32_t MagnitudeByteCount = MeasureVarUInt(Magnitude); + const uint64_t Offset = AddUninitialized(Data, MagnitudeByteCount); + WriteVarUInt(Magnitude, Data.data() + Offset); + EndField(CbFieldType::IntegerNegative); +} + +ZEN_NOINLINE +void +CbWriter::AddInteger(const uint32_t Value) +{ + BeginField(); + const uint32_t ValueByteCount = MeasureVarUInt(Value); + const uint64_t Offset = AddUninitialized(Data, ValueByteCount); + WriteVarUInt(Value, Data.data() + Offset); + EndField(CbFieldType::IntegerPositive); +} + +ZEN_NOINLINE +void +CbWriter::AddInteger(const uint64_t Value) +{ + BeginField(); + const uint32_t ValueByteCount = MeasureVarUInt(Value); + const uint64_t Offset = AddUninitialized(Data, ValueByteCount); + WriteVarUInt(Value, Data.data() + Offset); + EndField(CbFieldType::IntegerPositive); +} + +ZEN_NOINLINE +void +CbWriter::AddFloat(const float Value) +{ 
+ BeginField(); + const uint32_t RawValue = FromNetworkOrder(reinterpret_cast<const uint32_t&>(Value)); + Append(Data, reinterpret_cast<const uint8_t*>(&RawValue), sizeof(uint32_t)); + EndField(CbFieldType::Float32); +} + +ZEN_NOINLINE +void +CbWriter::AddFloat(const double Value) +{ + const float Value32 = float(Value); + if (Value == double(Value32)) + { + return AddFloat(Value32); + } + BeginField(); + const uint64_t RawValue = FromNetworkOrder(reinterpret_cast<const uint64_t&>(Value)); + Append(Data, reinterpret_cast<const uint8_t*>(&RawValue), sizeof(uint64_t)); + EndField(CbFieldType::Float64); +} + +ZEN_NOINLINE +void +CbWriter::AddBool(const bool bValue) +{ + BeginField(); + EndField(bValue ? CbFieldType::BoolTrue : CbFieldType::BoolFalse); +} + +ZEN_NOINLINE +void +CbWriter::AddCompactBinaryAttachment(const IoHash& Value) +{ + BeginField(); + Append(Data, Value.Hash, sizeof Value.Hash); + EndField(CbFieldType::CompactBinaryAttachment); +} + +ZEN_NOINLINE +void +CbWriter::AddBinaryAttachment(const IoHash& Value) +{ + BeginField(); + Append(Data, Value.Hash, sizeof Value.Hash); + EndField(CbFieldType::BinaryAttachment); +} + +ZEN_NOINLINE +void +CbWriter::AddAttachment(const CbAttachment& Attachment) +{ + BeginField(); + const IoHash& Value = Attachment.GetHash(); + Append(Data, Value.Hash, sizeof Value.Hash); + EndField(CbFieldType::BinaryAttachment); +} + +ZEN_NOINLINE +void +CbWriter::AddHash(const IoHash& Value) +{ + BeginField(); + Append(Data, Value.Hash, sizeof Value.Hash); + EndField(CbFieldType::Hash); +} + +void +CbWriter::AddUuid(const Guid& Value) +{ + const auto AppendSwappedBytes = [this](uint32_t In) { + In = FromNetworkOrder(In); + Append(Data, reinterpret_cast<const uint8_t*>(&In), sizeof In); + }; + BeginField(); + AppendSwappedBytes(Value.A); + AppendSwappedBytes(Value.B); + AppendSwappedBytes(Value.C); + AppendSwappedBytes(Value.D); + EndField(CbFieldType::Uuid); +} + +void +CbWriter::AddObjectId(const Oid& Value) +{ + BeginField(); + 
Append(Data, reinterpret_cast<const uint8_t*>(&Value.OidBits), sizeof Value.OidBits); + EndField(CbFieldType::ObjectId); +} + +void +CbWriter::AddDateTimeTicks(const int64_t Ticks) +{ + BeginField(); + const uint64_t RawValue = FromNetworkOrder(uint64_t(Ticks)); + Append(Data, reinterpret_cast<const uint8_t*>(&RawValue), sizeof(uint64_t)); + EndField(CbFieldType::DateTime); +} + +void +CbWriter::AddDateTime(const DateTime Value) +{ + AddDateTimeTicks(Value.GetTicks()); +} + +void +CbWriter::AddTimeSpanTicks(const int64_t Ticks) +{ + BeginField(); + const uint64_t RawValue = FromNetworkOrder(uint64_t(Ticks)); + Append(Data, reinterpret_cast<const uint8_t*>(&RawValue), sizeof(uint64_t)); + EndField(CbFieldType::TimeSpan); +} + +void +CbWriter::AddTimeSpan(const TimeSpan Value) +{ + AddTimeSpanTicks(Value.GetTicks()); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CbWriter& +operator<<(CbWriter& Writer, const DateTime Value) +{ + Writer.AddDateTime(Value); + return Writer; +} + +CbWriter& +operator<<(CbWriter& Writer, const TimeSpan Value) +{ + Writer.AddTimeSpan(Value); + return Writer; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void +usonbuilder_forcelink() +{ +} + +doctest::String +toString(const DateTime&) +{ + // TODO:implement + return ""; +} + +doctest::String +toString(const TimeSpan&) +{ + // TODO:implement + return ""; +} + +TEST_CASE("usonbuilder.object") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + SUBCASE("EmptyObject") + { + Writer.BeginObject(); + Writer.EndObject(); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.IsObject() == true); + CHECK(Field.AsObjectView().CreateViewIterator().HasValue() == false); + } + + SUBCASE("NamedEmptyObject") + { + 
Writer.SetName("Object"sv); + Writer.BeginObject(); + Writer.EndObject(); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.IsObject() == true); + CHECK(Field.AsObjectView().CreateViewIterator().HasValue() == false); + } + + SUBCASE("BasicObject") + { + Writer.BeginObject(); + Writer.SetName("Integer"sv).AddInteger(0); + Writer.SetName("Float"sv).AddFloat(0.0f); + Writer.EndObject(); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.IsObject() == true); + + CbObjectView Object = Field.AsObjectView(); + CHECK(Object["Integer"sv].IsInteger() == true); + CHECK(Object["Float"sv].IsFloat() == true); + } + + SUBCASE("UniformObject") + { + Writer.BeginObject(); + Writer.SetName("Field1"sv).AddInteger(0); + Writer.SetName("Field2"sv).AddInteger(1); + Writer.EndObject(); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.IsObject() == true); + + CbObjectView Object = Field.AsObjectView(); + CHECK(Object["Field1"sv].IsInteger() == true); + CHECK(Object["Field2"sv].IsInteger() == true); + } +} + +TEST_CASE("usonbuilder.array") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + SUBCASE("EmptyArray") + { + Writer.BeginArray(); + Writer.EndArray(); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.IsArray() == true); + CHECK(Field.AsArrayView().Num() == 0); + } + + SUBCASE("NamedEmptyArray") + { + Writer.SetName("Array"sv); + Writer.BeginArray(); + Writer.EndArray(); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.IsArray() == true); + CHECK(Field.AsArrayView().Num() == 0); + } + + 
SUBCASE("BasicArray") + { + Writer.BeginArray(); + Writer.AddInteger(0); + Writer.AddFloat(0.0f); + Writer.EndArray(); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.IsArray() == true); + CbFieldViewIterator Iterator = Field.AsArrayView().CreateViewIterator(); + CHECK(Iterator.IsInteger() == true); + ++Iterator; + CHECK(Iterator.IsFloat() == true); + ++Iterator; + CHECK(Iterator.HasValue() == false); + } + + SUBCASE("UniformArray") + { + Writer.BeginArray(); + Writer.AddInteger(0); + Writer.AddInteger(1); + Writer.EndArray(); + + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.IsArray() == true); + CbFieldViewIterator Iterator = Field.AsArrayView().CreateViewIterator(); + CHECK(Iterator.IsInteger() == true); + ++Iterator; + CHECK(Iterator.IsInteger() == true); + ++Iterator; + CHECK(Iterator.HasValue() == false); + } +} + +TEST_CASE("usonbuilder.null") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + SUBCASE("Null") + { + Writer.AddNull(); + CbField Field = Writer.Save(); + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.HasName() == false); + CHECK(Field.IsNull() == true); + } + + SUBCASE("NullWithName") + { + Writer.SetName("Null"sv); + Writer.AddNull(); + CbField Field = Writer.Save(); + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + CHECK(Field.HasName() == true); + CHECK(Field.GetName().compare("Null"sv) == 0); + CHECK(Field.IsNull() == true); + } + + SUBCASE("Null Array/Object Uniformity") + { + Writer.BeginArray(); + Writer.AddNull(); + Writer.AddNull(); + Writer.AddNull(); + Writer.EndArray(); + + Writer.BeginObject(); + Writer.SetName("N1"sv).AddNull(); + Writer.SetName("N2"sv).AddNull(); + Writer.SetName("N3"sv).AddNull(); + 
Writer.EndObject(); + + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + } + + SUBCASE("Null with Save(Buffer)") + { + constexpr int NullCount = 3; + for (int Index = 0; Index < NullCount; ++Index) + { + Writer.AddNull(); + } + uint8_t Buffer[NullCount]{}; + MutableMemoryView BufferView(Buffer, sizeof Buffer); + CbFieldViewIterator Fields = Writer.Save(BufferView); + + CHECK(ValidateCompactBinaryRange(BufferView, CbValidateMode::All) == CbValidateError::None); + + for (int Index = 0; Index < NullCount; ++Index) + { + CHECK(Fields.IsNull() == true); + ++Fields; + } + CHECK(Fields.HasValue() == false); + } +} + +TEST_CASE("usonbuilder.binary") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; +} + +TEST_CASE("usonbuilder.string") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + SUBCASE("Empty Strings") + { + Writer.AddString(std::string_view()); + Writer.AddString(std::wstring_view()); + + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + for (CbFieldView Field : Fields) + { + CHECK(Field.HasName() == false); + CHECK(Field.IsString() == true); + CHECK(Field.AsString().empty() == true); + } + } + + SUBCASE("Test Basic Strings") + { + Writer.SetName("String"sv).AddString("Value"sv); + Writer.SetName("String"sv).AddString(L"Value"sv); + + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + for (CbFieldView Field : Fields) + { + CHECK(Field.GetName().compare("String"sv) == 0); + CHECK(Field.HasName() == true); + CHECK(Field.IsString() == true); + CHECK(Field.AsString().compare("Value"sv) == 0); + } + } + + SUBCASE("Long Strings") + { + constexpr int DotCount = 256; + StringBuilder<DotCount + 1> Dots; + for (int Index = 0; Index < DotCount; ++Index) + { + 
Dots.Append('.'); + } + Writer.AddString(Dots); + Writer.AddString(std::wstring().append(256, L'.')); + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + for (CbFieldView Field : Fields) + { + CHECK((Field.AsString() == std::string_view(Dots))); + } + } + + SUBCASE("Non-ASCII String") + { + wchar_t Value[2] = {0xd83d, 0xde00}; + Writer.AddString("\xf0\x9f\x98\x80"sv); + Writer.AddString(std::wstring_view(Value, ZEN_ARRAY_COUNT(Value))); + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + for (CbFieldView Field : Fields) + { + CHECK((Field.AsString() == "\xf0\x9f\x98\x80"sv)); + } + } +} + +TEST_CASE("usonbuilder.integer") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + auto TestInt32 = [&Writer](int32_t Value) { + Writer.Reset(); + Writer.AddInteger(Value); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + CHECK(Field.AsInt32() == Value); + CHECK(Field.HasError() == false); + }; + + auto TestUInt32 = [&Writer](uint32_t Value) { + Writer.Reset(); + Writer.AddInteger(Value); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + CHECK(Field.AsUInt32() == Value); + CHECK(Field.HasError() == false); + }; + + auto TestInt64 = [&Writer](int64_t Value) { + Writer.Reset(); + Writer.AddInteger(Value); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + CHECK(Field.AsInt64() == Value); + CHECK(Field.HasError() == false); + }; + + auto TestUInt64 = [&Writer](uint64_t Value) { + Writer.Reset(); + Writer.AddInteger(Value); + CbField Field = Writer.Save(); + + CHECK(ValidateCompactBinary(Field.GetBuffer(), 
CbValidateMode::All) == CbValidateError::None); + + CHECK(Field.AsUInt64() == Value); + CHECK(Field.HasError() == false); + }; + + TestUInt32(uint32_t(0x00)); + TestUInt32(uint32_t(0x7f)); + TestUInt32(uint32_t(0x80)); + TestUInt32(uint32_t(0xff)); + TestUInt32(uint32_t(0x0100)); + TestUInt32(uint32_t(0x7fff)); + TestUInt32(uint32_t(0x8000)); + TestUInt32(uint32_t(0xffff)); + TestUInt32(uint32_t(0x0001'0000)); + TestUInt32(uint32_t(0x7fff'ffff)); + TestUInt32(uint32_t(0x8000'0000)); + TestUInt32(uint32_t(0xffff'ffff)); + + TestUInt64(uint64_t(0x0000'0001'0000'0000)); + TestUInt64(uint64_t(0x7fff'ffff'ffff'ffff)); + TestUInt64(uint64_t(0x8000'0000'0000'0000)); + TestUInt64(uint64_t(0xffff'ffff'ffff'ffff)); + + TestInt32(int32_t(0x01)); + TestInt32(int32_t(0x80)); + TestInt32(int32_t(0x81)); + TestInt32(int32_t(0x8000)); + TestInt32(int32_t(0x8001)); + TestInt32(int32_t(0x7fff'ffff)); + TestInt32(int32_t(0x8000'0000)); + TestInt32(int32_t(0x8000'0001)); + + TestInt64(int64_t(0x0000'0001'0000'0000)); + TestInt64(int64_t(0x8000'0000'0000'0000)); + TestInt64(int64_t(0x7fff'ffff'ffff'ffff)); + TestInt64(int64_t(0x8000'0000'0000'0001)); + TestInt64(int64_t(0xffff'ffff'ffff'ffff)); +} + +TEST_CASE("usonbuilder.float") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + SUBCASE("Float32") + { + constexpr float Values[] = { + 0.0f, + 1.0f, + -1.0f, + 3.14159265358979323846f, // PI + 3.402823466e+38f, // FLT_MAX + 1.175494351e-38f // FLT_MIN + }; + + for (float Value : Values) + { + Writer.AddFloat(Value); + } + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + const float* CheckValue = Values; + for (CbFieldView Field : Fields) + { + CHECK(Field.AsFloat() == *CheckValue++); + CHECK(Field.HasError() == false); + } + } + + SUBCASE("Float64") + { + constexpr double Values[] = { + 0.0f, + 1.0f, + -1.0f, + 3.14159265358979323846, // PI + 1.9999998807907104, + 
1.9999999403953552, + 3.4028234663852886e38, + 6.8056469327705771e38, + 2.2250738585072014e-308, // DBL_MIN + 1.7976931348623158e+308 // DBL_MAX + }; + + for (double Value : Values) + { + Writer.AddFloat(Value); + } + + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + const double* CheckValue = Values; + for (CbFieldView Field : Fields) + { + CHECK(Field.AsDouble() == *CheckValue++); + CHECK(Field.HasError() == false); + } + } +} + +TEST_CASE("usonbuilder.bool") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + SUBCASE("Bool") + { + Writer.AddBool(true); + Writer.AddBool(false); + + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + CHECK(Fields.AsBool() == true); + CHECK(Fields.HasError() == false); + ++Fields; + CHECK(Fields.AsBool() == false); + CHECK(Fields.HasError() == false); + ++Fields; + CHECK(Fields.HasValue() == false); + } + + SUBCASE("Bool Array/Object Uniformity") + { + Writer.BeginArray(); + Writer.AddBool(false); + Writer.AddBool(false); + Writer.AddBool(false); + Writer.EndArray(); + + Writer.BeginObject(); + Writer.SetName("B1"sv).AddBool(false); + Writer.SetName("B2"sv).AddBool(false); + Writer.SetName("B3"sv).AddBool(false); + Writer.EndObject(); + + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + } +} + +TEST_CASE("usonbuilder.usonattachment") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; +} + +TEST_CASE("usonbuilder.binaryattachment") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; +} + +TEST_CASE("usonbuilder.hash") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; +} + +TEST_CASE("usonbuilder.uuid") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; +} + 
+TEST_CASE("usonbuilder.datetime") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + const DateTime Values[] = {DateTime(0), DateTime(2020, 5, 13, 15, 10)}; + for (DateTime Value : Values) + { + Writer.AddDateTime(Value); + } + + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + const DateTime* CheckValue = Values; + for (CbFieldView Field : Fields) + { + CHECK(Field.AsDateTime() == *CheckValue++); + CHECK(Field.HasError() == false); + } +} + +TEST_CASE("usonbuilder.timespan") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + const TimeSpan Values[] = {TimeSpan(0), TimeSpan(1, 2, 4, 8)}; + for (TimeSpan Value : Values) + { + Writer.AddTimeSpan(Value); + } + + CbFieldIterator Fields = Writer.Save(); + + CHECK(ValidateCompactBinary(Fields.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + + const TimeSpan* CheckValue = Values; + for (CbFieldView Field : Fields) + { + CHECK(Field.AsTimeSpan() == *CheckValue++); + CHECK(Field.HasError() == false); + } +} + +TEST_CASE("usonbuilder.complex") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + SUBCASE("complex") + { + CbObject Object; + + { + Writer.BeginObject(); + + const uint8_t LocalField[] = {uint8_t(CbFieldType::IntegerPositive | CbFieldType::HasFieldName), 1, 'I', 42}; + Writer.AddField("FieldCopy"sv, CbFieldView(LocalField)); + Writer.AddField("FieldRefCopy"sv, CbField(SharedBuffer::Clone(MakeMemoryView(LocalField)))); + + const uint8_t LocalObject[] = {uint8_t(CbFieldType::Object | CbFieldType::HasFieldName), + 1, + 'O', + 7, + uint8_t(CbFieldType::IntegerPositive | CbFieldType::HasFieldName), + 1, + 'I', + 42, + uint8_t(CbFieldType::Null | CbFieldType::HasFieldName), + 1, + 'N'}; + Writer.AddObject("ObjectCopy"sv, CbObjectView(LocalObject)); + Writer.AddObject("ObjectRefCopy"sv, CbObject(SharedBuffer::Clone(MakeMemoryView(LocalObject)))); + + const 
uint8_t LocalArray[] = {uint8_t(CbFieldType::UniformArray | CbFieldType::HasFieldName), + 1, + 'A', + 4, + 2, + uint8_t(CbFieldType::IntegerPositive), + 42, + 21}; + Writer.AddArray("ArrayCopy"sv, CbArrayView(LocalArray)); + Writer.AddArray("ArrayRefCopy"sv, CbArray(SharedBuffer::Clone(MakeMemoryView(LocalArray)))); + + Writer.AddNull("Null"sv); + + Writer.BeginObject("Binary"sv); + { + Writer.AddBinary("Empty"sv, MemoryView()); + Writer.AddBinary("Value"sv, MakeMemoryView("BinaryValue")); + Writer.AddBinary("LargeValue"sv, MakeMemoryView(std::wstring().append(256, L'.'))); + Writer.AddBinary("LargeRefValue"sv, SharedBuffer::Clone(MakeMemoryView(std::wstring().append(256, L'!')))); + } + Writer.EndObject(); + + Writer.BeginObject("Strings"sv); + { + Writer.AddString("AnsiString"sv, "AnsiValue"sv); + Writer.AddString("WideString"sv, std::wstring().append(256, L'.')); + Writer.AddString("EmptyAnsiString"sv, std::string_view()); + Writer.AddString("EmptyWideString"sv, std::wstring_view()); + Writer.AddString("AnsiStringLiteral", "AnsiValue"); + Writer.AddString("WideStringLiteral", L"AnsiValue"); + } + Writer.EndObject(); + + Writer.BeginArray("Integers"sv); + { + Writer.AddInteger(int32_t(-1)); + Writer.AddInteger(int64_t(-1)); + Writer.AddInteger(uint32_t(1)); + Writer.AddInteger(uint64_t(1)); + Writer.AddInteger(std::numeric_limits<int32_t>::min()); + Writer.AddInteger(std::numeric_limits<int32_t>::max()); + Writer.AddInteger(std::numeric_limits<uint32_t>::max()); + Writer.AddInteger(std::numeric_limits<int64_t>::min()); + Writer.AddInteger(std::numeric_limits<int64_t>::max()); + Writer.AddInteger(std::numeric_limits<uint64_t>::max()); + } + Writer.EndArray(); + + Writer.BeginArray("UniformIntegers"sv); + { + Writer.AddInteger(0); + Writer.AddInteger(std::numeric_limits<int32_t>::max()); + Writer.AddInteger(std::numeric_limits<uint32_t>::max()); + Writer.AddInteger(std::numeric_limits<int64_t>::max()); + Writer.AddInteger(std::numeric_limits<uint64_t>::max()); + } 
+ Writer.EndArray(); + + Writer.AddFloat("Float32"sv, 1.0f); + Writer.AddFloat("Float64as32"sv, 2.0); + Writer.AddFloat("Float64"sv, 3.0e100); + + Writer.AddBool("False"sv, false); + Writer.AddBool("True"sv, true); + + Writer.AddCompactBinaryAttachment("CompactBinaryAttachment"sv, IoHash()); + Writer.AddBinaryAttachment("BinaryAttachment"sv, IoHash()); + Writer.AddAttachment("Attachment"sv, CbAttachment()); + + Writer.AddHash("Hash"sv, IoHash()); + Writer.AddUuid("Uuid"sv, Guid()); + + Writer.AddDateTimeTicks("DateTimeZero"sv, 0); + Writer.AddDateTime("DateTime2020"sv, DateTime(2020, 5, 13, 15, 10)); + + Writer.AddTimeSpanTicks("TimeSpanZero"sv, 0); + Writer.AddTimeSpan("TimeSpan"sv, TimeSpan(1, 2, 4, 8)); + + Writer.BeginObject("NestedObjects"sv); + { + Writer.BeginObject("Empty"sv); + Writer.EndObject(); + + Writer.BeginObject("Null"sv); + Writer.AddNull("Null"sv); + Writer.EndObject(); + } + Writer.EndObject(); + + Writer.BeginArray("NestedArrays"sv); + { + Writer.BeginArray(); + Writer.EndArray(); + + Writer.BeginArray(); + Writer.AddNull(); + Writer.AddNull(); + Writer.AddNull(); + Writer.EndArray(); + + Writer.BeginArray(); + Writer.AddBool(false); + Writer.AddBool(false); + Writer.AddBool(false); + Writer.EndArray(); + + Writer.BeginArray(); + Writer.AddBool(true); + Writer.AddBool(true); + Writer.AddBool(true); + Writer.EndArray(); + } + Writer.EndArray(); + + Writer.BeginArray("ArrayOfObjects"sv); + { + Writer.BeginObject(); + Writer.EndObject(); + + Writer.BeginObject(); + Writer.AddNull("Null"sv); + Writer.EndObject(); + } + Writer.EndArray(); + + Writer.BeginArray("LargeArray"sv); + for (int Index = 0; Index < 256; ++Index) + { + Writer.AddInteger(Index - 128); + } + Writer.EndArray(); + + Writer.BeginArray("LargeUniformArray"sv); + for (int Index = 0; Index < 256; ++Index) + { + Writer.AddInteger(Index); + } + Writer.EndArray(); + + Writer.BeginArray("NestedUniformArray"sv); + for (int Index = 0; Index < 16; ++Index) + { + Writer.BeginArray(); + for 
(int Value = 0; Value < 4; ++Value) + { + Writer.AddInteger(Value); + } + Writer.EndArray(); + } + Writer.EndArray(); + + Writer.EndObject(); + Object = Writer.Save().AsObject(); + } + CHECK(ValidateCompactBinary(Object.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + } +} + +TEST_CASE("usonbuilder.stream") +{ + using namespace std::literals; + + FixedCbWriter<256> Writer; + + SUBCASE("basic") + { + CbObject Object; + { + Writer.BeginObject(); + + const uint8_t LocalField[] = {uint8_t(CbFieldType::IntegerPositive | CbFieldType::HasFieldName), 1, 'I', 42}; + Writer << "FieldCopy"sv << CbFieldView(LocalField); + + const uint8_t LocalObject[] = {uint8_t(CbFieldType::Object | CbFieldType::HasFieldName), + 1, + 'O', + 7, + uint8_t(CbFieldType::IntegerPositive | CbFieldType::HasFieldName), + 1, + 'I', + 42, + uint8_t(CbFieldType::Null | CbFieldType::HasFieldName), + 1, + 'N'}; + Writer << "ObjectCopy"sv << CbObjectView(LocalObject); + + const uint8_t LocalArray[] = {uint8_t(CbFieldType::UniformArray | CbFieldType::HasFieldName), + 1, + 'A', + 4, + 2, + uint8_t(CbFieldType::IntegerPositive), + 42, + 21}; + Writer << "ArrayCopy"sv << CbArrayView(LocalArray); + + Writer << "Null"sv << nullptr; + + Writer << "Strings"sv; + Writer.BeginObject(); + Writer << "AnsiString"sv + << "AnsiValue"sv + << "AnsiStringLiteral"sv + << "AnsiValue" + << "WideString"sv << L"WideValue"sv << "WideStringLiteral"sv << L"WideValue"; + Writer.EndObject(); + + Writer << "Integers"sv; + Writer.BeginArray(); + Writer << int32_t(-1) << int64_t(-1) << uint32_t(1) << uint64_t(1); + Writer.EndArray(); + + Writer << "Float32"sv << 1.0f; + Writer << "Float64"sv << 2.0; + + Writer << "False"sv << false << "True"sv << true; + + Writer << "Attachment"sv << CbAttachment(); + + Writer << "Hash"sv << IoHash(); + Writer << "Uuid"sv << Guid(); + + Writer << "DateTime"sv << DateTime(2020, 5, 13, 15, 10); + Writer << "TimeSpan"sv << TimeSpan(1, 2, 4, 8); + + Writer << "LiteralName" << nullptr; + + 
Writer.EndObject(); + Object = Writer.Save().AsObject(); + } + + CHECK(ValidateCompactBinary(Object.GetBuffer(), CbValidateMode::All) == CbValidateError::None); + } +} + +} // namespace zen diff --git a/zencore/compactbinarypackage.cpp b/zencore/compactbinarypackage.cpp new file mode 100644 index 000000000..21883f5b1 --- /dev/null +++ b/zencore/compactbinarypackage.cpp @@ -0,0 +1,945 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "zencore/compactbinarypackage.h" +#include <zencore/compactbinarybuilder.h> +#include <zencore/compactbinaryvalidation.h> +#include <zencore/endian.h> +#include <zencore/stream.h> +#include <zencore/trace.h> + +#include <doctest/doctest.h> + +namespace zen { + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +CbAttachment::CbAttachment(CbFieldIterator InValue, const IoHash* const InHash) +{ + if (InValue) + { + if (!InValue.IsOwned()) + { + InValue = CbFieldIterator::CloneRange(InValue); + } + + CompactBinary = CbFieldViewIterator(InValue); + Buffer = std::move(InValue).GetOuterBuffer(); + } + + if (InHash) + { + Hash = *InHash; + if (CompactBinary) + { + ZEN_ASSERT_SLOW(Hash == CompactBinary.GetRangeHash()); + } + else + { +#if 0 + zenfs_assertSlow(Hash.IsZero(), TEXT("A null or empty field range must use a hash of zero.")); +#endif + } + } + else if (CompactBinary) + { + Hash = CompactBinary.GetRangeHash(); + } +} + +CbAttachment::CbAttachment(SharedBuffer InBuffer, const IoHash* const InHash) : Buffer(std::move(InBuffer)) +{ + Buffer.MakeOwned(); + if (InHash) + { + Hash = *InHash; + if (Buffer.GetSize()) + { + ZEN_ASSERT_SLOW(Hash == IoHash::HashMemory(Buffer.GetData(), Buffer.GetSize())); + } + else + { + ZEN_ASSERT_SLOW(Hash == IoHash::Zero, TEXT("A null or empty buffer must use a hash of zero.")); + } + } + else if (Buffer.GetSize()) + { + Hash = IoHash::HashMemory(Buffer.GetData(), Buffer.GetSize()); + } + else + { + Buffer.Reset(); + } +} 
+ +SharedBuffer +CbAttachment::AsBinaryView() const +{ + if (!CompactBinary) + { + return Buffer; + } + + MemoryView SerializedView; + if (CompactBinary.TryGetSerializedRangeView(SerializedView)) + { + return SerializedView == Buffer.GetView() ? Buffer : SharedBuffer::MakeView(SerializedView, Buffer); + } + + return CbFieldIterator::CloneRange(CompactBinary).GetRangeBuffer(); +} + +CbFieldIterator +CbAttachment::AsCompactBinary() const +{ + return CompactBinary ? CbFieldIterator::MakeRangeView(CompactBinary, Buffer) : CbFieldIterator(); +} + +void +CbAttachment::Load(IoBuffer& InBuffer, BufferAllocator Allocator) +{ + MemoryInStream InStream(InBuffer.Data(), InBuffer.Size()); + BinaryReader Reader(InStream); + + Load(Reader, Allocator); +} + +void +CbAttachment::Load(CbFieldIterator& Fields) +{ + ZEN_ASSERT(Fields.IsBinary()); //, TEXT("Attachments must start with a binary field.")); + const MemoryView View = Fields.AsBinaryView(); + if (View.GetSize() > 0) + { + Buffer = SharedBuffer::MakeView(View, Fields.GetOuterBuffer()); + Buffer.MakeOwned(); + ++Fields; + Hash = Fields.AsAttachment(); + ZEN_ASSERT(!Fields.HasError()); // TEXT("Attachments must be a non-empty binary value with a content hash.")); + if (Fields.IsCompactBinaryAttachment()) + { + CompactBinary = CbFieldViewIterator::MakeRange(Buffer); + } + ++Fields; + } + else + { + ++Fields; + Buffer.Reset(); + CompactBinary.Reset(); + Hash = IoHash::Zero; + } +} + +void +CbAttachment::Load(BinaryReader& Reader, BufferAllocator Allocator) +{ + CbField BufferField = LoadCompactBinary(Reader, Allocator); + ZEN_ASSERT(BufferField.IsBinary(), "Attachments must start with a binary field"); + const MemoryView View = BufferField.AsBinaryView(); + if (View.GetSize() > 0) + { + Buffer = SharedBuffer::MakeView(View, BufferField.GetOuterBuffer()); + Buffer.MakeOwned(); + CompactBinary = CbFieldViewIterator(); + + std::vector<uint8_t> HashBuffer; + CbField HashField = LoadCompactBinary(Reader, [&HashBuffer](uint64_t Size) 
-> UniqueBuffer { + HashBuffer.resize(Size); + return UniqueBuffer::MakeView(HashBuffer.data(), Size); + }); + Hash = HashField.AsAttachment(); + ZEN_ASSERT(!HashField.HasError(), "Attachments must be a non-empty binary value with a content hash."); + if (HashField.IsCompactBinaryAttachment()) + { + CompactBinary = CbFieldViewIterator::MakeRange(Buffer); + } + } + else + { + Buffer.Reset(); + CompactBinary.Reset(); + Hash = IoHash::Zero; + } +} + +void +CbAttachment::Save(CbWriter& Writer) const +{ + if (CompactBinary) + { + MemoryView SerializedView; + if (CompactBinary.TryGetSerializedRangeView(SerializedView)) + { + Writer.AddBinary(SerializedView); + } + else + { + Writer.AddBinary(AsBinaryView()); + } + Writer.AddCompactBinaryAttachment(Hash); + } + else if (Buffer && Buffer.GetSize()) + { + Writer.AddBinary(Buffer); + Writer.AddBinaryAttachment(Hash); + } + else // Null + { + Writer.AddBinary(MemoryView()); + } +} + +void +CbAttachment::Save(BinaryWriter& Writer) const +{ + CbWriter TempWriter; + Save(TempWriter); + TempWriter.Save(Writer); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void +CbPackage::SetObject(CbObject InObject, const IoHash* InObjectHash, AttachmentResolver* InResolver) +{ + if (InObject.CreateIterator()) + { + Object = InObject.IsOwned() ? 
std::move(InObject) : CbObject::Clone(InObject); + if (InObjectHash) + { + ObjectHash = *InObjectHash; + ZEN_ASSERT_SLOW(ObjectHash == Object.GetHash()); + } + else + { + ObjectHash = Object.GetHash(); + } + if (InResolver) + { + GatherAttachments(Object.CreateIterator(), *InResolver); + } + } + else + { + Object.Reset(); + ObjectHash = IoHash::Zero; + } +} + +void +CbPackage::AddAttachment(const CbAttachment& Attachment, AttachmentResolver* Resolver) +{ + if (!Attachment.IsNull()) + { + auto It = std::lower_bound(begin(Attachments), end(Attachments), Attachment); + if (It != Attachments.end() && *It == Attachment) + { + CbAttachment& Existing = *It; + if (Attachment.IsCompactBinary() && !Existing.IsCompactBinary()) + { + Existing = CbAttachment(CbFieldIterator::MakeRange(Existing.AsBinaryView())); + } + } + else + { + Attachments.insert(It, Attachment); + } + + if (Attachment.IsCompactBinary() && Resolver) + { + GatherAttachments(Attachment.AsCompactBinary(), *Resolver); + } + } +} + +int32_t +CbPackage::RemoveAttachment(const IoHash& Hash) +{ + return gsl::narrow_cast<int32_t>( + std::erase_if(Attachments, [&Hash](const CbAttachment& Attachment) -> bool { return Attachment.GetHash() == Hash; })); +} + +bool +CbPackage::Equals(const CbPackage& Package) const +{ + return ObjectHash == Package.ObjectHash && Attachments == Package.Attachments; +} + +const CbAttachment* +CbPackage::FindAttachment(const IoHash& Hash) const +{ + auto It = std::find_if(begin(Attachments), end(Attachments), [&Hash](const CbAttachment& Attachment) -> bool { + return Attachment.GetHash() == Hash; + }); + + if (It == end(Attachments)) + return nullptr; + + return &*It; +} + +void +CbPackage::GatherAttachments(const CbFieldViewIterator& Fields, AttachmentResolver Resolver) +{ + Fields.IterateRangeAttachments([this, &Resolver](CbFieldView Field) { + const IoHash& Hash = Field.AsAttachment(); + + if (SharedBuffer Buffer = Resolver(Hash)) + { + if (Field.IsCompactBinaryAttachment()) + { + 
AddAttachment(CbAttachment(CbFieldIterator::MakeRange(std::move(Buffer)), Hash), &Resolver); + } + else + { + AddAttachment(CbAttachment(std::move(Buffer), Hash)); + } + } + }); +} + +void +CbPackage::Load(IoBuffer& InBuffer, BufferAllocator Allocator) +{ + MemoryInStream InStream(InBuffer.Data(), InBuffer.Size()); + BinaryReader Reader(InStream); + + Load(Reader, Allocator); +} + +void +CbPackage::Load(CbFieldIterator& Fields) +{ + *this = CbPackage(); + while (Fields) + { + if (Fields.IsNull()) + { + ++Fields; + break; + } + else if (Fields.IsBinary()) + { + CbAttachment Attachment; + Attachment.Load(Fields); + AddAttachment(Attachment); + } + else + { + ZEN_ASSERT(Fields.IsObject(), TEXT("Expected Object, Binary, or Null field when loading a package.")); + Object = Fields.AsObject(); + Object.MakeOwned(); + ++Fields; + if (Object.CreateIterator()) + { + ObjectHash = Fields.AsCompactBinaryAttachment(); + ZEN_ASSERT(!Fields.HasError(), TEXT("Object must be followed by a CompactBinaryReference with the object hash.")); + ++Fields; + } + else + { + Object.Reset(); + } + } + } +} + +void +CbPackage::Load(BinaryReader& Reader, BufferAllocator Allocator) +{ + uint8_t StackBuffer[64]; + const auto StackAllocator = [&Allocator, &StackBuffer](uint64_t Size) -> UniqueBuffer { + if (Size <= sizeof(StackBuffer)) + { + return UniqueBuffer::MakeView(StackBuffer, Size); + } + + return Allocator(Size); + }; + + *this = CbPackage(); + + for (;;) + { + CbField ValueField = LoadCompactBinary(Reader, StackAllocator); + if (ValueField.IsNull()) + { + break; + } + else if (ValueField.IsBinary()) + { + const MemoryView View = ValueField.AsBinaryView(); + if (View.GetSize() > 0) + { + SharedBuffer Buffer = SharedBuffer::MakeView(View, ValueField.GetOuterBuffer()); + Buffer.MakeOwned(); + CbField HashField = LoadCompactBinary(Reader, StackAllocator); + const IoHash& Hash = HashField.AsAttachment(); + ZEN_ASSERT(!HashField.HasError(), "Attachments must be a non-empty binary value with a 
content hash."); + if (HashField.IsCompactBinaryAttachment()) + { + AddAttachment(CbAttachment(CbFieldIterator::MakeRange(std::move(Buffer)), Hash)); + } + else + { + AddAttachment(CbAttachment(std::move(Buffer), Hash)); + } + } + } + else + { + ZEN_ASSERT(ValueField.IsObject(), "Expected Object, Binary, or Null field when loading a package"); + Object = ValueField.AsObject(); + Object.MakeOwned(); + if (Object.CreateViewIterator()) + { + CbField HashField = LoadCompactBinary(Reader, StackAllocator); + ObjectHash = HashField.AsCompactBinaryAttachment(); + ZEN_ASSERT(!HashField.HasError(), "Object must be followed by a CompactBinaryAttachment with the object hash."); + } + else + { + Object.Reset(); + } + } + } +} + +void +CbPackage::Save(CbWriter& Writer) const +{ + if (Object.CreateIterator()) + { + Writer.AddObject(Object); + Writer.AddCompactBinaryAttachment(ObjectHash); + } + for (const CbAttachment& Attachment : Attachments) + { + Attachment.Save(Writer); + } + Writer.AddNull(); +} + +void +CbPackage::Save(BinaryWriter& StreamWriter) const +{ + CbWriter Writer; + Save(Writer); + Writer.Save(StreamWriter); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void +usonpackage_forcelink() +{ +} + +TEST_CASE("usonpackage") +{ + using namespace std::literals; + + const auto TestSaveLoadValidate = [&](const char* Test, const CbAttachment& Attachment) { + ZEN_UNUSED(Test); + + CbWriter Writer; + Attachment.Save(Writer); + CbFieldIterator Fields = Writer.Save(); + + MemoryOutStream WriteStream; + BinaryWriter StreamWriter{WriteStream}; + Attachment.Save(StreamWriter); + + CHECK(MakeMemoryView(WriteStream).EqualBytes(Fields.GetRangeBuffer().GetView())); + CHECK(ValidateCompactBinaryRange(MakeMemoryView(WriteStream), CbValidateMode::All) == CbValidateError::None); + CHECK(ValidateCompactBinaryAttachment(MakeMemoryView(WriteStream), CbValidateMode::All) == CbValidateError::None); + + 
CbAttachment FromFields; + FromFields.Load(Fields); + CHECK(!bool(Fields)); + CHECK(FromFields == Attachment); + + CbAttachment FromArchive; + MemoryInStream InStream(MakeMemoryView(WriteStream)); + BinaryReader Reader(InStream); + FromArchive.Load(Reader); + CHECK(Reader.CurrentOffset() == InStream.Size()); + CHECK(FromArchive == Attachment); + }; + + SUBCASE("Empty Attachment") + { + CbAttachment Attachment; + CHECK(Attachment.IsNull()); + CHECK_FALSE(bool(Attachment)); + CHECK_FALSE(bool(Attachment.AsBinaryView())); + CHECK_FALSE(Attachment.AsCompactBinary().HasValue()); + CHECK_FALSE(Attachment.IsBinary()); + CHECK_FALSE(Attachment.IsCompactBinary()); + CHECK(Attachment.GetHash() == IoHash::Zero); + TestSaveLoadValidate("Null", Attachment); + } + + SUBCASE("Binary Attachment") + { + const SharedBuffer Buffer = SharedBuffer::Clone(MakeMemoryView<uint8_t>({0, 1, 2, 3})); + CbAttachment Attachment(Buffer); + CHECK_FALSE(Attachment.IsNull()); + CHECK(bool(Attachment)); + CHECK(Attachment.AsBinaryView() == Buffer); + CHECK_FALSE(Attachment.AsCompactBinary().HasValue()); + CHECK(Attachment.IsBinary()); + CHECK_FALSE(Attachment.IsCompactBinary()); + CHECK(Attachment.GetHash() == IoHash::HashMemory(Buffer)); + TestSaveLoadValidate("Binary", Attachment); + } + + SUBCASE("Compact Binary Attachment") + { + CbWriter Writer; + Writer << "Name"sv << 42; + CbFieldIterator Fields = Writer.Save(); + CbAttachment Attachment(Fields); + + CHECK_FALSE(Attachment.IsNull()); + CHECK(bool(Attachment)); + CHECK(Attachment.AsBinaryView() == Fields.GetRangeBuffer()); + CHECK(Attachment.AsCompactBinary() == Fields); + CHECK(Attachment.IsBinary()); + CHECK(Attachment.IsCompactBinary()); + CHECK(Attachment.GetHash() == Fields.GetRangeHash()); + TestSaveLoadValidate("CompactBinary", Attachment); + } + + SUBCASE("Binary View") + { + const uint8_t Value[]{0, 1, 2, 3}; + SharedBuffer Buffer = SharedBuffer::MakeView(MakeMemoryView(Value)); + CbAttachment Attachment(Buffer); + 
CHECK_FALSE(Attachment.IsNull()); + CHECK(bool(Attachment)); + CHECK(Attachment.AsBinaryView().GetView().EqualBytes(Buffer.GetView())); + CHECK_FALSE(Attachment.AsCompactBinary().HasValue()); + CHECK(Attachment.IsBinary()); + CHECK_FALSE(Attachment.IsCompactBinary()); + CHECK(Attachment.GetHash() == IoHash::HashMemory(Buffer)); + } + + SUBCASE("Compact Binary View") + { + CbWriter Writer; + Writer << "Name"sv << 42; + CbFieldIterator Fields = Writer.Save(); + CbFieldIterator FieldsView = CbFieldIterator::MakeRangeView(CbFieldViewIterator(Fields)); + CbAttachment Attachment(FieldsView); + + CHECK_FALSE(Attachment.IsNull()); + CHECK(bool(Attachment)); + + CHECK(Attachment.AsBinaryView() != FieldsView.GetRangeBuffer()); + + CHECK(Attachment.AsCompactBinary().GetRangeView().EqualBytes(Fields.GetRangeView())); + CHECK(Attachment.IsBinary()); + CHECK(Attachment.GetHash() == Fields.GetRangeHash()); + } + + SUBCASE("Binary Load from View") + { + const uint8_t Value[]{0, 1, 2, 3}; + const SharedBuffer Buffer = SharedBuffer::MakeView(MakeMemoryView(Value)); + CbAttachment Attachment(Buffer); + + CbWriter Writer; + Attachment.Save(Writer); + CbFieldIterator Fields = Writer.Save(); + CbFieldIterator FieldsView = CbFieldIterator::MakeRangeView(CbFieldViewIterator(Fields)); + Attachment.Load(FieldsView); + + CHECK_FALSE(Attachment.IsNull()); + CHECK(bool(Attachment)); + CHECK_FALSE(FieldsView.GetRangeBuffer().GetView().Contains(Attachment.AsBinaryView().GetView())); + CHECK(Attachment.AsBinaryView().GetView().EqualBytes(Buffer.GetView())); + CHECK_FALSE(Attachment.AsCompactBinary().HasValue()); + CHECK(Attachment.IsBinary()); + CHECK_FALSE(Attachment.IsCompactBinary()); + CHECK(Attachment.GetHash() == IoHash::HashMemory(MakeMemoryView(Value))); + } + + SUBCASE("Compact Binary Load from View") + { + CbWriter ValueWriter; + ValueWriter << "Name"sv << 42; + const CbFieldIterator Value = ValueWriter.Save(); + + CHECK(ValidateCompactBinaryRange(Value.GetRangeView(), 
CbValidateMode::All) == CbValidateError::None); + CbAttachment Attachment(Value); + + CbWriter Writer; + Attachment.Save(Writer); + CbFieldIterator Fields = Writer.Save(); + CbFieldIterator FieldsView = CbFieldIterator::MakeRangeView(CbFieldViewIterator(Fields)); + + Attachment.Load(FieldsView); + + CHECK_FALSE(Attachment.IsNull()); + CHECK(bool(Attachment)); + + CHECK(Attachment.AsBinaryView().GetView().EqualBytes(Value.GetRangeView())); + CHECK_FALSE(FieldsView.GetRangeBuffer().GetView().Contains(Attachment.AsCompactBinary().GetRangeBuffer().GetView())); + CHECK(Attachment.IsBinary()); + CHECK(Attachment.IsCompactBinary()); + + CHECK(Attachment.GetHash() == Value.GetRangeHash()); + } + + SUBCASE("Compact Binary Uniform Sub-View") + { + const SharedBuffer Buffer = SharedBuffer::Clone(MakeMemoryView<uint8_t>({0, 1, 2, 3})); + const CbFieldViewIterator FieldViews = CbFieldViewIterator::MakeRange(Buffer.GetView().RightChop(2), CbFieldType::IntegerPositive); + const CbFieldIterator SavedFields = CbFieldIterator::CloneRange(FieldViews); + CbFieldIterator Fields = CbFieldIterator::MakeRangeView(FieldViews, Buffer); + CbAttachment Attachment(Fields); + const SharedBuffer Binary = Attachment.AsBinaryView(); + CHECK(Attachment.AsCompactBinary() == Fields); + CHECK(Binary.GetSize() == SavedFields.GetRangeSize()); + CHECK(Binary.GetView().EqualBytes(SavedFields.GetRangeView())); + CHECK(Attachment.GetHash() == SavedFields.GetRangeHash()); + TestSaveLoadValidate("CompactBinaryUniformSubView", Attachment); + } + + SUBCASE("Binary Null") + { + const CbAttachment Attachment(SharedBuffer{}); + + CHECK(Attachment.IsNull()); + CHECK_FALSE(Attachment.IsBinary()); + CHECK_FALSE(Attachment.IsCompactBinary()); + CHECK(Attachment.GetHash() == IoHash::Zero); + } + + SUBCASE("Binary Empty") + { + const CbAttachment Attachment(SharedBuffer(UniqueBuffer::Alloc(0))); + + CHECK(Attachment.IsNull()); + CHECK_FALSE(Attachment.IsBinary()); + CHECK_FALSE(Attachment.IsCompactBinary()); + 
CHECK(Attachment.GetHash() == IoHash::Zero); + } + + SUBCASE("Compact Binary Empty") + { + const CbAttachment Attachment(CbFieldIterator{}); + + CHECK(Attachment.IsNull()); + CHECK_FALSE(Attachment.IsBinary()); + CHECK_FALSE(Attachment.IsCompactBinary()); + CHECK(Attachment.GetHash() == IoHash::Zero); + } +} + +TEST_CASE("usonpackage.serialization") +{ + using namespace std::literals; + + const auto TestSaveLoadValidate = [&](const char* Test, const CbPackage& Package) { + ZEN_UNUSED(Test); + + CbWriter Writer; + Package.Save(Writer); + CbFieldIterator Fields = Writer.Save(); + + MemoryOutStream MemStream; + BinaryWriter WriteAr(MemStream); + Package.Save(WriteAr); + + CHECK(MakeMemoryView(MemStream).EqualBytes(Fields.GetRangeBuffer().GetView())); + CHECK(ValidateCompactBinaryRange(MakeMemoryView(MemStream), CbValidateMode::All) == CbValidateError::None); + CHECK(ValidateCompactBinaryPackage(MakeMemoryView(MemStream), CbValidateMode::All) == CbValidateError::None); + + CbPackage FromFields; + FromFields.Load(Fields); + CHECK_FALSE(bool(Fields)); + CHECK(FromFields == Package); + + CbPackage FromArchive; + MemoryInStream ReadMemStream(MakeMemoryView(MemStream)); + BinaryReader ReadAr(ReadMemStream); + FromArchive.Load(ReadAr); + CHECK(ReadAr.CurrentOffset() == ReadMemStream.Size()); + CHECK(FromArchive == Package); + }; + + SUBCASE("Empty") + { + CbPackage Package; + CHECK(Package.IsNull()); + CHECK_FALSE(bool(Package)); + CHECK(Package.GetAttachments().size() == 0); + TestSaveLoadValidate("Empty", Package); + } + + SUBCASE("Object Only") + { + CbWriter Writer; + Writer.BeginObject(); + Writer << "Field" << 42; + Writer.EndObject(); + + const CbObject Object = Writer.Save().AsObject(); + CbPackage Package(Object); + CHECK_FALSE(Package.IsNull()); + CHECK(bool(Package)); + CHECK(Package.GetAttachments().size() == 0); + CHECK(Package.GetObject().GetOuterBuffer() == Object.GetOuterBuffer()); + CHECK(Package.GetObject()["Field"].AsInt32() == 42); + 
CHECK(Package.GetObjectHash() == Package.GetObject().GetHash()); + TestSaveLoadValidate("Object", Package); + } + + // Object View Only + { + CbWriter Writer; + Writer.BeginObject(); + Writer << "Field" << 42; + Writer.EndObject(); + + const CbObject Object = Writer.Save().AsObject(); + CbPackage Package(CbObject::MakeView(Object)); + CHECK_FALSE(Package.IsNull()); + CHECK(bool(Package)); + CHECK(Package.GetAttachments().size() == 0); + CHECK(Package.GetObject().GetOuterBuffer() != Object.GetOuterBuffer()); + CHECK(Package.GetObject()["Field"].AsInt32() == 42); + CHECK(Package.GetObjectHash() == Package.GetObject().GetHash()); + TestSaveLoadValidate("Object", Package); + } + + // Attachment Only + { + CbObject Object; + { + CbWriter Writer; + Writer.BeginObject(); + Writer << "Field" << 42; + Writer.EndObject(); + Object = Writer.Save().AsObject(); + } + CbField Field = CbField::Clone(Object["Field"]); + + CbPackage Package; + Package.AddAttachment(CbAttachment(CbFieldIterator::MakeSingle(Object.AsField()))); + Package.AddAttachment(CbAttachment(Field.GetBuffer())); + + CHECK_FALSE(Package.IsNull()); + CHECK(bool(Package)); + CHECK(Package.GetAttachments().size() == 2); + CHECK(Package.GetObject().Equals(CbObject())); + CHECK(Package.GetObjectHash() == IoHash()); + TestSaveLoadValidate("Attachments", Package); + + const CbAttachment* const ObjectAttachment = Package.FindAttachment(Object.GetHash()); + REQUIRE(ObjectAttachment); + + const CbAttachment* const FieldAttachment = Package.FindAttachment(Field.GetHash()); + REQUIRE(FieldAttachment); + + CHECK(ObjectAttachment->AsCompactBinary().AsObject().Equals(Object)); + CHECK(FieldAttachment->AsBinaryView() == Field.GetBuffer()); + + Package.AddAttachment(CbAttachment(SharedBuffer::Clone(Object.GetView()))); + Package.AddAttachment(CbAttachment(CbFieldIterator::CloneRange(CbFieldViewIterator::MakeSingle(Field)))); + + CHECK(Package.GetAttachments().size() == 2); + CHECK(Package.FindAttachment(Object.GetHash()) == 
ObjectAttachment); + CHECK(Package.FindAttachment(Field.GetHash()) == FieldAttachment); + + CHECK(ObjectAttachment->AsCompactBinary().AsObject().Equals(Object)); + CHECK(ObjectAttachment->AsBinaryView() == Object.GetBuffer()); + CHECK(FieldAttachment->AsCompactBinary().Equals(Field)); + CHECK(FieldAttachment->AsBinaryView() == Field.GetBuffer()); + + CHECK(std::is_sorted(begin(Package.GetAttachments()), end(Package.GetAttachments()))); + } + + // Shared Values + const uint8_t Level4Values[]{0, 1, 2, 3}; + SharedBuffer Level4 = SharedBuffer::MakeView(MakeMemoryView(Level4Values)); + const IoHash Level4Hash = IoHash::HashMemory(Level4); + + CbField Level3; + { + CbWriter Writer; + Writer.SetName("Level4").AddBinaryAttachment(Level4Hash); + Level3 = Writer.Save(); + } + const IoHash Level3Hash = Level3.GetHash(); + + CbArray Level2; + { + CbWriter Writer; + Writer.SetName("Level3"); + Writer.BeginArray(); + Writer.AddCompactBinaryAttachment(Level3Hash); + Writer.EndArray(); + Level2 = Writer.Save().AsArray(); + } + const IoHash Level2Hash = Level2.AsFieldView().GetHash(); + + CbObject Level1; + { + CbWriter Writer; + Writer.BeginObject(); + Writer.SetName("Level2").AddCompactBinaryAttachment(Level2Hash); + Writer.EndObject(); + Level1 = Writer.Save().AsObject(); + } + const IoHash Level1Hash = Level1.AsFieldView().GetHash(); + + const auto Resolver = [&Level2, &Level2Hash, &Level3, &Level3Hash, &Level4, &Level4Hash](const IoHash& Hash) -> SharedBuffer { + return Hash == Level2Hash ? Level2.GetBuffer() + : Hash == Level3Hash ? Level3.GetBuffer() + : Hash == Level4Hash ? 
Level4 + : SharedBuffer(); + }; + + // Object + Attachments + { + CbPackage Package; + Package.SetObject(Level1, Level1Hash, Resolver); + + CHECK_FALSE(Package.IsNull()); + CHECK(bool(Package)); + CHECK(Package.GetAttachments().size() == 3); + CHECK(Package.GetObject().GetBuffer() == Level1.GetBuffer()); + CHECK(Package.GetObjectHash() == Level1Hash); + TestSaveLoadValidate("Object+Attachments", Package); + + const CbAttachment* const Level2Attachment = Package.FindAttachment(Level2Hash); + const CbAttachment* const Level3Attachment = Package.FindAttachment(Level3Hash); + const CbAttachment* const Level4Attachment = Package.FindAttachment(Level4Hash); + CHECK((Level2Attachment && Level2Attachment->AsCompactBinary().AsArray().Equals(Level2))); + CHECK((Level3Attachment && Level3Attachment->AsCompactBinary().Equals(Level3))); + CHECK((Level4Attachment && Level4Attachment->AsBinaryView() != Level4 && + Level4Attachment->AsBinaryView().GetView().EqualBytes(Level4.GetView()))); + + CHECK(std::is_sorted(begin(Package.GetAttachments()), end(Package.GetAttachments()))); + + const CbPackage PackageCopy = Package; + CHECK(PackageCopy == Package); + + CHECK(Package.RemoveAttachment(Level1Hash) == 0); + CHECK(Package.RemoveAttachment(Level2Hash) == 1); + CHECK(Package.RemoveAttachment(Level3Hash) == 1); + CHECK(Package.RemoveAttachment(Level4Hash) == 1); + CHECK(Package.RemoveAttachment(Level4Hash) == 0); + CHECK(Package.GetAttachments().size() == 0); + + CHECK(PackageCopy != Package); + Package = PackageCopy; + CHECK(PackageCopy == Package); + Package.SetObject(CbObject()); + CHECK(PackageCopy != Package); + CHECK(Package.GetObjectHash() == IoHash()); + } + + // Out of Order + { + CbWriter Writer; + Writer.AddBinary(Level2.GetBuffer()); + Writer.AddCompactBinaryAttachment(Level2Hash); + Writer.AddBinary(Level4); + Writer.AddBinaryAttachment(Level4Hash); + Writer.AddObject(Level1); + Writer.AddCompactBinaryAttachment(Level1Hash); + Writer.AddBinary(Level3.GetBuffer()); + 
Writer.AddCompactBinaryAttachment(Level3Hash); + Writer.AddNull(); + + CbFieldIterator Fields = Writer.Save(); + CbPackage FromFields; + FromFields.Load(Fields); + + const CbAttachment* const Level2Attachment = FromFields.FindAttachment(Level2Hash); + REQUIRE(Level2Attachment); + const CbAttachment* const Level3Attachment = FromFields.FindAttachment(Level3Hash); + REQUIRE(Level3Attachment); + const CbAttachment* const Level4Attachment = FromFields.FindAttachment(Level4Hash); + REQUIRE(Level4Attachment); + + CHECK(FromFields.GetObject().Equals(Level1)); + CHECK(FromFields.GetObject().GetOuterBuffer() == Fields.GetOuterBuffer()); + CHECK(FromFields.GetObjectHash() == Level1Hash); + + const MemoryView FieldsOuterBufferView = Fields.GetOuterBuffer().GetView(); + + CHECK(Level2Attachment->AsCompactBinary().AsArray().Equals(Level2)); + CHECK(FieldsOuterBufferView.Contains(Level2Attachment->AsBinaryView().GetView())); + CHECK(Level2Attachment->GetHash() == Level2Hash); + + CHECK(Level3Attachment->AsCompactBinary().Equals(Level3)); + CHECK(FieldsOuterBufferView.Contains(Level3Attachment->AsBinaryView().GetView())); + CHECK(Level3Attachment->GetHash() == Level3Hash); + + CHECK(Level4Attachment->AsBinaryView().GetView().EqualBytes(Level4.GetView())); + CHECK(FieldsOuterBufferView.Contains(Level4Attachment->AsBinaryView().GetView())); + CHECK(Level4Attachment->GetHash() == Level4Hash); + + MemoryOutStream WriteStream; + BinaryWriter WriteAr(WriteStream); + Writer.Save(WriteAr); + CbPackage FromArchive; + MemoryInStream ReadStream(MakeMemoryView(WriteStream)); + BinaryReader ReadAr(ReadStream); + FromArchive.Load(ReadAr); + + Writer.Reset(); + FromArchive.Save(Writer); + CbFieldIterator Saved = Writer.Save(); + CHECK(Saved.AsObject().Equals(Level1)); + ++Saved; + CHECK(Saved.AsCompactBinaryAttachment() == Level1Hash); + ++Saved; + CHECK(Saved.AsBinaryView().EqualBytes(Level2.GetView())); + ++Saved; + CHECK(Saved.AsCompactBinaryAttachment() == Level2Hash); + ++Saved; + 
CHECK(Saved.AsBinaryView().EqualBytes(Level3.GetView())); + ++Saved; + CHECK(Saved.AsCompactBinaryAttachment() == Level3Hash); + ++Saved; + CHECK(Saved.AsBinaryView().EqualBytes(Level4.GetView())); + ++Saved; + CHECK(Saved.AsBinaryAttachment() == Level4Hash); + ++Saved; + CHECK(Saved.IsNull()); + ++Saved; + CHECK(!Saved); + } + + // Null Attachment + { + const CbAttachment NullAttachment; + CbPackage Package; + Package.AddAttachment(NullAttachment); + CHECK(Package.IsNull()); + CHECK_FALSE(bool(Package)); + CHECK(Package.GetAttachments().size() == 0); + CHECK_FALSE(Package.FindAttachment(NullAttachment)); + } + + // Resolve After Merge + { + bool bResolved = false; + CbPackage Package; + Package.AddAttachment(CbAttachment(Level3.GetBuffer())); + Package.AddAttachment(CbAttachment(CbFieldIterator::MakeSingle(Level3)), [&bResolved](const IoHash& Hash) -> SharedBuffer { + ZEN_UNUSED(Hash); + bResolved = true; + return SharedBuffer(); + }); + CHECK(bResolved); + } +} + +} // namespace zen diff --git a/zencore/compactbinaryvalidation.cpp b/zencore/compactbinaryvalidation.cpp new file mode 100644 index 000000000..51ed31e95 --- /dev/null +++ b/zencore/compactbinaryvalidation.cpp @@ -0,0 +1,607 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "zencore/compactbinaryvalidation.h" + +#include <zencore/compactbinarypackage.h> +#include <zencore/endian.h> +#include <zencore/memory.h> +#include <zencore/string.h> + +#include <algorithm> + +#include <doctest/doctest.h> + +namespace zen { + +namespace CbValidationPrivate { + + template<typename T> + static constexpr inline T ReadUnaligned(const void* const Memory) + { +#if PLATFORM_SUPPORTS_UNALIGNED_LOADS + return *static_cast<const T*>(Memory); +#else + T Value; + memcpy(&Value, Memory, sizeof(Value)); + return Value; +#endif + } + +} // namespace CbValidationPrivate + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Adds the given error(s) to the error mask. + * + * This function exists to make validation errors easier to debug by providing one location to set a breakpoint. + */ +ZEN_NOINLINE static void +AddError(CbValidateError& OutError, const CbValidateError InError) +{ + OutError |= InError; +} + +/** + * Validate and read a field type from the view. + * + * A type argument with the HasFieldType flag indicates that the type will not be read from the view. + */ +static CbFieldType +ValidateCbFieldType(MemoryView& View, CbValidateMode Mode, CbValidateError& Error, CbFieldType Type = CbFieldType::HasFieldType) +{ + ZEN_UNUSED(Mode); + if (CbFieldTypeOps::HasFieldType(Type)) + { + if (View.GetSize() >= 1) + { + Type = *static_cast<const CbFieldType*>(View.GetData()); + View += 1; + if (CbFieldTypeOps::HasFieldType(Type)) + { + AddError(Error, CbValidateError::InvalidType); + } + } + else + { + AddError(Error, CbValidateError::OutOfBounds); + View.Reset(); + return CbFieldType::None; + } + } + + if (CbFieldTypeOps::GetSerializedType(Type) != Type) + { + AddError(Error, CbValidateError::InvalidType); + View.Reset(); + } + + return Type; +} + +/** + * Validate and read an unsigned integer from the view. 
+ * + * Modifies the view to start at the end of the value, and adds error flags if applicable. + */ +static uint64_t +ValidateCbUInt(MemoryView& View, CbValidateMode Mode, CbValidateError& Error) +{ + if (View.GetSize() > 0 && View.GetSize() >= MeasureVarUInt(View.GetData())) + { + uint32_t ValueByteCount; + const uint64_t Value = ReadVarUInt(View.GetData(), ValueByteCount); + if (EnumHasAnyFlags(Mode, CbValidateMode::Format) && ValueByteCount > MeasureVarUInt(Value)) + { + AddError(Error, CbValidateError::InvalidInteger); + } + View += ValueByteCount; + return Value; + } + else + { + AddError(Error, CbValidateError::OutOfBounds); + View.Reset(); + return 0; + } +} + +/** + * Validate a 64-bit floating point value from the view. + * + * Modifies the view to start at the end of the value, and adds error flags if applicable. + */ +static void +ValidateCbFloat64(MemoryView& View, CbValidateMode Mode, CbValidateError& Error) +{ + if (View.GetSize() >= sizeof(double)) + { + if (EnumHasAnyFlags(Mode, CbValidateMode::Format)) + { + const uint64_t RawValue = FromNetworkOrder(CbValidationPrivate::ReadUnaligned<uint64_t>(View.GetData())); + const double Value = reinterpret_cast<const double&>(RawValue); + if (Value == double(float(Value))) + { + AddError(Error, CbValidateError::InvalidFloat); + } + } + View += sizeof(double); + } + else + { + AddError(Error, CbValidateError::OutOfBounds); + View.Reset(); + } +} + +/** + * Validate and read a string from the view. + * + * Modifies the view to start at the end of the string, and adds error flags if applicable. 
+ */ +static std::string_view +ValidateCbString(MemoryView& View, CbValidateMode Mode, CbValidateError& Error) +{ + const uint64_t NameSize = ValidateCbUInt(View, Mode, Error); + if (View.GetSize() >= NameSize) + { + const std::string_view Name(static_cast<const char*>(View.GetData()), static_cast<int32_t>(NameSize)); + View += NameSize; + return Name; + } + else + { + AddError(Error, CbValidateError::OutOfBounds); + View.Reset(); + return std::string_view(); + } +} + +static CbFieldView ValidateCbField(MemoryView& View, CbValidateMode Mode, CbValidateError& Error, CbFieldType ExternalType); + +/** A type that checks whether all validated fields are of the same type. */ +class CbUniformFieldsValidator +{ +public: + inline explicit CbUniformFieldsValidator(CbFieldType InExternalType) : ExternalType(InExternalType) {} + + inline CbFieldView ValidateField(MemoryView& View, CbValidateMode Mode, CbValidateError& Error) + { + const void* const FieldData = View.GetData(); + if (CbFieldView Field = ValidateCbField(View, Mode, Error, ExternalType)) + { + ++FieldCount; + if (CbFieldTypeOps::HasFieldType(ExternalType)) + { + const CbFieldType FieldType = *static_cast<const CbFieldType*>(FieldData); + if (FieldCount == 1) + { + FirstType = FieldType; + } + else if (FieldType != FirstType) + { + bUniform = false; + } + } + return Field; + } + + // It may not safe to check for uniformity if the field was invalid. 
+ bUniform = false; + return CbFieldView(); + } + + inline bool IsUniform() const { return FieldCount > 0 && bUniform; } + +private: + uint32_t FieldCount = 0; + bool bUniform = true; + CbFieldType FirstType = CbFieldType::None; + CbFieldType ExternalType; +}; + +static void +ValidateCbObject(MemoryView& View, CbValidateMode Mode, CbValidateError& Error, CbFieldType ObjectType) +{ + const uint64_t Size = ValidateCbUInt(View, Mode, Error); + MemoryView ObjectView = View.Left(Size); + View += Size; + + if (Size > 0) + { + std::vector<std::string_view> Names; + + const bool bUniformObject = CbFieldTypeOps::GetType(ObjectType) == CbFieldType::UniformObject; + const CbFieldType ExternalType = bUniformObject ? ValidateCbFieldType(ObjectView, Mode, Error) : CbFieldType::HasFieldType; + CbUniformFieldsValidator UniformValidator(ExternalType); + do + { + if (CbFieldView Field = UniformValidator.ValidateField(ObjectView, Mode, Error)) + { + if (EnumHasAnyFlags(Mode, CbValidateMode::Names)) + { + if (Field.HasName()) + { + Names.push_back(Field.GetName()); + } + else + { + AddError(Error, CbValidateError::MissingName); + } + } + } + } while (!ObjectView.IsEmpty()); + + if (EnumHasAnyFlags(Mode, CbValidateMode::Names) && Names.size() > 1) + { + std::sort(begin(Names), end(Names), [](std::string_view L, std::string_view R) { return L.compare(R) < 0; }); + + for (const std::string_view *NamesIt = Names.data(), *NamesEnd = NamesIt + Names.size() - 1; NamesIt != NamesEnd; ++NamesIt) + { + if (NamesIt[0] == NamesIt[1]) + { + AddError(Error, CbValidateError::DuplicateName); + break; + } + } + } + + if (!bUniformObject && EnumHasAnyFlags(Mode, CbValidateMode::Format) && UniformValidator.IsUniform()) + { + AddError(Error, CbValidateError::NonUniformObject); + } + } +} + +static void +ValidateCbArray(MemoryView& View, CbValidateMode Mode, CbValidateError& Error, CbFieldType ArrayType) +{ + const uint64_t Size = ValidateCbUInt(View, Mode, Error); + MemoryView ArrayView = 
View.Left(Size); + View += Size; + + const uint64_t Count = ValidateCbUInt(ArrayView, Mode, Error); + const uint64_t FieldsSize = ArrayView.GetSize(); + const bool bUniformArray = CbFieldTypeOps::GetType(ArrayType) == CbFieldType::UniformArray; + const CbFieldType ExternalType = bUniformArray ? ValidateCbFieldType(ArrayView, Mode, Error) : CbFieldType::HasFieldType; + CbUniformFieldsValidator UniformValidator(ExternalType); + + for (uint64_t Index = 0; Index < Count; ++Index) + { + if (CbFieldView Field = UniformValidator.ValidateField(ArrayView, Mode, Error)) + { + if (Field.HasName() && EnumHasAnyFlags(Mode, CbValidateMode::Names)) + { + AddError(Error, CbValidateError::ArrayName); + } + } + } + + if (!bUniformArray && EnumHasAnyFlags(Mode, CbValidateMode::Format) && UniformValidator.IsUniform() && FieldsSize > Count) + { + AddError(Error, CbValidateError::NonUniformArray); + } +} + +static CbFieldView +ValidateCbField(MemoryView& View, CbValidateMode Mode, CbValidateError& Error, const CbFieldType ExternalType = CbFieldType::HasFieldType) +{ + const MemoryView FieldView = View; + const CbFieldType Type = ValidateCbFieldType(View, Mode, Error, ExternalType); + const std::string_view Name = CbFieldTypeOps::HasFieldName(Type) ? 
ValidateCbString(View, Mode, Error) : std::string_view(); + + auto ValidateFixedPayload = [&View, &Error](uint32_t PayloadSize) { + if (View.GetSize() >= PayloadSize) + { + View += PayloadSize; + } + else + { + AddError(Error, CbValidateError::OutOfBounds); + View.Reset(); + } + }; + + if (EnumHasAnyFlags(Error, CbValidateError::OutOfBounds | CbValidateError::InvalidType)) + { + return CbFieldView(); + } + + switch (CbFieldType FieldType = CbFieldTypeOps::GetType(Type)) + { + default: + case CbFieldType::None: + AddError(Error, CbValidateError::InvalidType); + View.Reset(); + break; + case CbFieldType::Null: + case CbFieldType::BoolFalse: + case CbFieldType::BoolTrue: + if (FieldView == View) + { + // Reset the view because a zero-sized field can cause infinite field iteration. + AddError(Error, CbValidateError::InvalidType); + View.Reset(); + } + break; + case CbFieldType::Object: + case CbFieldType::UniformObject: + ValidateCbObject(View, Mode, Error, FieldType); + break; + case CbFieldType::Array: + case CbFieldType::UniformArray: + ValidateCbArray(View, Mode, Error, FieldType); + break; + case CbFieldType::Binary: + { + const uint64_t ValueSize = ValidateCbUInt(View, Mode, Error); + if (View.GetSize() < ValueSize) + { + AddError(Error, CbValidateError::OutOfBounds); + View.Reset(); + } + else + { + View += ValueSize; + } + break; + } + case CbFieldType::String: + ValidateCbString(View, Mode, Error); + break; + case CbFieldType::IntegerPositive: + ValidateCbUInt(View, Mode, Error); + break; + case CbFieldType::IntegerNegative: + ValidateCbUInt(View, Mode, Error); + break; + case CbFieldType::Float32: + ValidateFixedPayload(4); + break; + case CbFieldType::Float64: + ValidateCbFloat64(View, Mode, Error); + break; + case CbFieldType::CompactBinaryAttachment: + case CbFieldType::BinaryAttachment: + case CbFieldType::Hash: + ValidateFixedPayload(20); + break; + case CbFieldType::Uuid: + ValidateFixedPayload(16); + break; + case CbFieldType::DateTime: + case 
CbFieldType::TimeSpan: + ValidateFixedPayload(8); + break; + case CbFieldType::ObjectId: + ValidateFixedPayload(12); + break; + case CbFieldType::CustomById: + case CbFieldType::CustomByName: + ZEN_NOT_IMPLEMENTED(); // TODO: FIX! + break; + } + + if (EnumHasAnyFlags(Error, CbValidateError::OutOfBounds | CbValidateError::InvalidType)) + { + return CbFieldView(); + } + + return CbFieldView(FieldView.GetData(), ExternalType); +} + +static CbFieldView +ValidateCbPackageField(MemoryView& View, CbValidateMode Mode, CbValidateError& Error) +{ + if (View.IsEmpty()) + { + if (EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + AddError(Error, CbValidateError::InvalidPackageFormat); + } + return CbFieldView(); + } + if (CbFieldView Field = ValidateCbField(View, Mode, Error)) + { + if (Field.HasName() && EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + AddError(Error, CbValidateError::InvalidPackageFormat); + } + return Field; + } + return CbFieldView(); +} + +static IoHash +ValidateCbPackageAttachment(CbFieldView& Value, MemoryView& View, CbValidateMode Mode, CbValidateError& Error) +{ + const MemoryView ValueView = Value.AsBinaryView(); + if (Value.HasError() && EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + if (EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + AddError(Error, CbValidateError::InvalidPackageFormat); + } + } + else if (ValueView.GetSize()) + { + if (CbFieldView HashField = ValidateCbPackageField(View, Mode, Error)) + { + const IoHash Hash = HashField.AsAttachment(); + if (EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + if (HashField.HasError()) + { + AddError(Error, CbValidateError::InvalidPackageFormat); + } + else if (Hash != IoHash::HashMemory(ValueView.GetData(), ValueView.GetSize())) + { + AddError(Error, CbValidateError::InvalidPackageHash); + } + } + return Hash; + } + } + return IoHash(); +} + +static IoHash +ValidateCbPackageObject(CbFieldView& Value, MemoryView& View, CbValidateMode Mode, CbValidateError& Error) +{ + 
CbObjectView Object = Value.AsObjectView(); + if (Value.HasError()) + { + if (EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + AddError(Error, CbValidateError::InvalidPackageFormat); + } + } + else if (CbFieldView HashField = ValidateCbPackageField(View, Mode, Error)) + { + const IoHash Hash = HashField.AsAttachment(); + if (EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + if (!Object.CreateViewIterator()) + { + AddError(Error, CbValidateError::NullPackageObject); + } + if (HashField.HasError()) + { + AddError(Error, CbValidateError::InvalidPackageFormat); + } + else if (Hash != Value.GetHash()) + { + AddError(Error, CbValidateError::InvalidPackageHash); + } + } + return Hash; + } + return IoHash(); +} + +CbValidateError +ValidateCompactBinary(MemoryView View, CbValidateMode Mode, CbFieldType Type) +{ + CbValidateError Error = CbValidateError::None; + if (EnumHasAnyFlags(Mode, CbValidateMode::All)) + { + ValidateCbField(View, Mode, Error, Type); + if (!View.IsEmpty() && EnumHasAnyFlags(Mode, CbValidateMode::Padding)) + { + AddError(Error, CbValidateError::Padding); + } + } + return Error; +} + +CbValidateError +ValidateCompactBinaryRange(MemoryView View, CbValidateMode Mode) +{ + CbValidateError Error = CbValidateError::None; + if (EnumHasAnyFlags(Mode, CbValidateMode::All)) + { + while (!View.IsEmpty()) + { + ValidateCbField(View, Mode, Error); + } + } + return Error; +} + +CbValidateError +ValidateCompactBinaryAttachment(MemoryView View, CbValidateMode Mode) +{ + CbValidateError Error = CbValidateError::None; + if (EnumHasAnyFlags(Mode, CbValidateMode::All)) + { + if (CbFieldView Value = ValidateCbPackageField(View, Mode, Error)) + { + ValidateCbPackageAttachment(Value, View, Mode, Error); + } + if (!View.IsEmpty() && EnumHasAnyFlags(Mode, CbValidateMode::Padding)) + { + AddError(Error, CbValidateError::Padding); + } + } + return Error; +} + +CbValidateError +ValidateCompactBinaryPackage(MemoryView View, CbValidateMode Mode) +{ + std::vector<IoHash> 
Attachments; + CbValidateError Error = CbValidateError::None; + if (EnumHasAnyFlags(Mode, CbValidateMode::All)) + { + uint32_t ObjectCount = 0; + while (CbFieldView Value = ValidateCbPackageField(View, Mode, Error)) + { + if (Value.IsBinary()) + { + const IoHash Hash = ValidateCbPackageAttachment(Value, View, Mode, Error); + if (EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + Attachments.push_back(Hash); + if (Value.AsBinaryView().IsEmpty()) + { + AddError(Error, CbValidateError::NullPackageAttachment); + } + } + } + else if (Value.IsObject()) + { + ValidateCbPackageObject(Value, View, Mode, Error); + if (++ObjectCount > 1 && EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + AddError(Error, CbValidateError::MultiplePackageObjects); + } + } + else if (Value.IsNull()) + { + break; + } + else if (EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + AddError(Error, CbValidateError::InvalidPackageFormat); + } + + if (EnumHasAnyFlags(Error, CbValidateError::OutOfBounds)) + { + break; + } + } + + if (!View.IsEmpty() && EnumHasAnyFlags(Mode, CbValidateMode::Padding)) + { + AddError(Error, CbValidateError::Padding); + } + + if (Attachments.size() && EnumHasAnyFlags(Mode, CbValidateMode::Package)) + { + std::sort(begin(Attachments), end(Attachments)); + for (const IoHash *It = Attachments.data(), *End = It + Attachments.size() - 1; It != End; ++It) + { + if (It[0] == It[1]) + { + AddError(Error, CbValidateError::DuplicateAttachments); + break; + } + } + } + } + return Error; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void +usonvalidation_forcelink() +{ +} + +TEST_CASE("usonvalidation") +{ + SUBCASE("Basic") {} +} + +} // namespace zen diff --git a/zencore/compress.cpp b/zencore/compress.cpp new file mode 100644 index 000000000..0a9c43949 --- /dev/null +++ b/zencore/compress.cpp @@ -0,0 +1,11 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <zencore/compress.h> + +#include <doctest/doctest.h> +#include <lz4.h> +#include <functional> + +namespace zen { + +} // namespace zen diff --git a/zencore/except.cpp b/zencore/except.cpp new file mode 100644 index 000000000..b02122f58 --- /dev/null +++ b/zencore/except.cpp @@ -0,0 +1,17 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zencore/except.h> + +namespace zen { + +void +ThrowSystemException([[maybe_unused]] HRESULT hRes, [[maybe_unused]] const char* Message) +{ + // TODO + + int ErrValue = hRes; + + throw std::system_error(ErrValue, std::system_category(), Message); +} + +} // namespace zen diff --git a/zencore/filesystem.cpp b/zencore/filesystem.cpp new file mode 100644 index 000000000..663a626a1 --- /dev/null +++ b/zencore/filesystem.cpp @@ -0,0 +1,592 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zencore/filesystem.h> + +#include <zencore/except.h> +#include <zencore/fmtutils.h> +#include <zencore/iobuffer.h> +#include <zencore/string.h> +#include <zencore/windows.h> + +#include <atlbase.h> +#include <atlfile.h> +#include <winioctl.h> +#include <winnt.h> +#include <filesystem> + +#include <spdlog/spdlog.h> + +#include <gsl/gsl-lite.hpp> + +namespace zen { + +using namespace std::literals; + +static bool +DeleteReparsePoint(const wchar_t* Path, DWORD dwReparseTag) +{ + CHandle hDir(CreateFileW(Path, + GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS | FILE_FLAG_OPEN_REPARSE_POINT, + nullptr)); + + if (hDir != INVALID_HANDLE_VALUE) + { + REPARSE_GUID_DATA_BUFFER Rgdb = {}; + Rgdb.ReparseTag = dwReparseTag; + + DWORD dwBytes; + const BOOL bOK = + DeviceIoControl(hDir, FSCTL_DELETE_REPARSE_POINT, &Rgdb, REPARSE_GUID_DATA_BUFFER_HEADER_SIZE, nullptr, 0, &dwBytes, nullptr); + + return bOK == TRUE; + } + + return false; +} + +bool +CreateDirectories(const wchar_t* Dir) +{ + return std::filesystem::create_directories(Dir); +} + 
+bool +CreateDirectories(const std::filesystem::path& Dir) +{ + return std::filesystem::create_directories(Dir); +} + +// Erase all files and directories in a given directory, leaving an empty directory +// behind + +static bool +WipeDirectory(const wchar_t* DirPath) +{ + ExtendableWideStringBuilder<128> Pattern; + Pattern.Append(DirPath); + Pattern.Append(L"\\*"); + + WIN32_FIND_DATAW FindData; + HANDLE hFind = FindFirstFileW(Pattern.c_str(), &FindData); + + bool AllOk = true; + + if (hFind != nullptr) + { + do + { + bool IsRegular = true; + + if (FindData.cFileName[0] == L'.') + { + if (FindData.cFileName[1] == L'.') + { + if (FindData.cFileName[2] == L'\0') + { + IsRegular = false; + } + } + else if (FindData.cFileName[1] == L'\0') + { + IsRegular = false; + } + } + + if (IsRegular) + { + ExtendableWideStringBuilder<128> Path; + Path.Append(DirPath); + Path.Append(L'\\'); + Path.Append(FindData.cFileName); + + // if (fd.dwFileAttributes & FILE_ATTRIBUTE_RECALL_ON_OPEN) + // deleteReparsePoint(path.c_str(), fd.dwReserved0); + + if (FindData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) + { + if (FindData.dwFileAttributes & FILE_ATTRIBUTE_RECALL_ON_OPEN) + { + DeleteReparsePoint(Path.c_str(), FindData.dwReserved0); + } + + if (FindData.dwFileAttributes & FILE_ATTRIBUTE_RECALL_ON_DATA_ACCESS) + { + DeleteReparsePoint(Path.c_str(), FindData.dwReserved0); + } + + bool Success = DeleteDirectories(Path.c_str()); + + if (!Success) + { + if (FindData.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) + { + DeleteReparsePoint(Path.c_str(), FindData.dwReserved0); + } + } + } + else + { + DeleteFileW(Path.c_str()); + } + } + } while (FindNextFileW(hFind, &FindData) == TRUE); + } + + FindClose(hFind); + + return true; +} + +bool +DeleteDirectories(const wchar_t* DirPath) +{ + return WipeDirectory(DirPath) && RemoveDirectoryW(DirPath) == TRUE; +} + +bool +CleanDirectory(const wchar_t* DirPath) +{ + if (std::filesystem::exists(DirPath)) + { + return WipeDirectory(DirPath); + } + 
else + { + return CreateDirectories(DirPath); + } +} + +bool +DeleteDirectories(const std::filesystem::path& Dir) +{ + return DeleteDirectories(Dir.c_str()); +} + +bool +CleanDirectory(const std::filesystem::path& Dir) +{ + return CleanDirectory(Dir.c_str()); +} + +////////////////////////////////////////////////////////////////////////// + +bool +SupportsBlockRefCounting(std::filesystem::path Path) +{ + ATL::CHandle Handle(CreateFileW(Path.c_str(), + GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, + nullptr)); + + if (Handle == INVALID_HANDLE_VALUE) + { + Handle.Detach(); + return false; + } + + ULONG FileSystemFlags = 0; + if (!GetVolumeInformationByHandleW(Handle, nullptr, 0, nullptr, nullptr, /* lpFileSystemFlags */ &FileSystemFlags, nullptr, 0)) + { + return false; + } + + if (!(FileSystemFlags & FILE_SUPPORTS_BLOCK_REFCOUNTING)) + { + return false; + } + + return true; +} + +bool +CloneFile(std::filesystem::path FromPath, std::filesystem::path ToPath) +{ + ATL::CHandle FromFile(CreateFileW(FromPath.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, 0, nullptr)); + if (FromFile == INVALID_HANDLE_VALUE) + { + FromFile.Detach(); + return false; + } + + ULONG FileSystemFlags; + if (!GetVolumeInformationByHandleW(FromFile, nullptr, 0, nullptr, nullptr, /* lpFileSystemFlags */ &FileSystemFlags, nullptr, 0)) + { + return false; + } + if (!(FileSystemFlags & FILE_SUPPORTS_BLOCK_REFCOUNTING)) + { + SetLastError(ERROR_NOT_CAPABLE); + return false; + } + + FILE_END_OF_FILE_INFO FileSize; + if (!GetFileSizeEx(FromFile, &FileSize.EndOfFile)) + { + return false; + } + + FILE_BASIC_INFO BasicInfo; + if (!GetFileInformationByHandleEx(FromFile, FileBasicInfo, &BasicInfo, sizeof BasicInfo)) + { + return false; + } + + DWORD dwBytesReturned = 0; + FSCTL_GET_INTEGRITY_INFORMATION_BUFFER GetIntegrityInfoBuffer; + if (!DeviceIoControl(FromFile, + FSCTL_GET_INTEGRITY_INFORMATION, + 
nullptr, + 0, + &GetIntegrityInfoBuffer, + sizeof GetIntegrityInfoBuffer, + &dwBytesReturned, + nullptr)) + { + return false; + } + + SetFileAttributesW(ToPath.c_str(), FILE_ATTRIBUTE_NORMAL); + + ATL::CHandle TargetFile(CreateFileW(ToPath.c_str(), + GENERIC_READ | GENERIC_WRITE | DELETE, + /* no sharing */ FILE_SHARE_READ, + nullptr, + OPEN_ALWAYS, + 0, + /* hTemplateFile */ FromFile)); + + if (TargetFile == INVALID_HANDLE_VALUE) + { + TargetFile.Detach(); + return false; + } + + // Delete target file when handle is closed (we only reset this if the copy succeeds) + FILE_DISPOSITION_INFO FileDisposition = {TRUE}; + if (!SetFileInformationByHandle(TargetFile, FileDispositionInfo, &FileDisposition, sizeof FileDisposition)) + { + TargetFile.Close(); + DeleteFileW(ToPath.c_str()); + return false; + } + + // Make file sparse so we don't end up allocating space when we change the file size + if (!DeviceIoControl(TargetFile, FSCTL_SET_SPARSE, nullptr, 0, nullptr, 0, &dwBytesReturned, nullptr)) + { + return false; + } + + // Copy integrity checking information + FSCTL_SET_INTEGRITY_INFORMATION_BUFFER IntegritySet = {GetIntegrityInfoBuffer.ChecksumAlgorithm, + GetIntegrityInfoBuffer.Reserved, + GetIntegrityInfoBuffer.Flags}; + if (!DeviceIoControl(TargetFile, FSCTL_SET_INTEGRITY_INFORMATION, &IntegritySet, sizeof IntegritySet, nullptr, 0, nullptr, nullptr)) + { + return false; + } + + // Resize file - note that the file is sparse at this point so no additional data will be written + if (!SetFileInformationByHandle(TargetFile, FileEndOfFileInfo, &FileSize, sizeof FileSize)) + { + return false; + } + + constexpr auto RoundToClusterSize = [](LONG64 FileSize, ULONG ClusterSize) -> LONG64 { + return (FileSize + ClusterSize - 1) / ClusterSize * ClusterSize; + }; + static_assert(RoundToClusterSize(5678, 4 * 1024) == 8 * 1024); + + // Loop for cloning file contents. 
This is necessary as the API has a 32-bit size + // limit for some reason + + const LONG64 SplitThreshold = (1LL << 32) - GetIntegrityInfoBuffer.ClusterSizeInBytes; + + DUPLICATE_EXTENTS_DATA DuplicateExtentsData{.FileHandle = FromFile}; + + for (LONG64 CurrentByteOffset = 0, + RemainingBytes = RoundToClusterSize(FileSize.EndOfFile.QuadPart, GetIntegrityInfoBuffer.ClusterSizeInBytes); + RemainingBytes > 0; + CurrentByteOffset += SplitThreshold, RemainingBytes -= SplitThreshold) + { + DuplicateExtentsData.SourceFileOffset.QuadPart = CurrentByteOffset; + DuplicateExtentsData.TargetFileOffset.QuadPart = CurrentByteOffset; + DuplicateExtentsData.ByteCount.QuadPart = std::min(SplitThreshold, RemainingBytes); + + if (!DeviceIoControl(TargetFile, + FSCTL_DUPLICATE_EXTENTS_TO_FILE, + &DuplicateExtentsData, + sizeof DuplicateExtentsData, + nullptr, + 0, + &dwBytesReturned, + nullptr)) + { + return false; + } + } + + // Make the file not sparse again now that we have populated the contents + if (!(BasicInfo.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE)) + { + FILE_SET_SPARSE_BUFFER SetSparse = {FALSE}; + + if (!DeviceIoControl(TargetFile, FSCTL_SET_SPARSE, &SetSparse, sizeof SetSparse, nullptr, 0, &dwBytesReturned, nullptr)) + { + return false; + } + } + + // Update timestamps (but don't lie about the creation time) + BasicInfo.CreationTime.QuadPart = 0; + if (!SetFileInformationByHandle(TargetFile, FileBasicInfo, &BasicInfo, sizeof BasicInfo)) + { + return false; + } + + if (!FlushFileBuffers(TargetFile)) + { + return false; + } + + // Finally now everything is done - make sure the file is not deleted on close! 
+ + FileDisposition = {FALSE}; + + const bool AllOk = (TRUE == SetFileInformationByHandle(TargetFile, FileDispositionInfo, &FileDisposition, sizeof FileDisposition)); + + return AllOk; +} + +bool +CopyFile(std::filesystem::path FromPath, std::filesystem::path ToPath, const CopyFileOptions& Options) +{ + bool Success = false; + + if (Options.EnableClone) + { + Success = CloneFile(FromPath.native(), ToPath.native()); + + if (Success) + { + return true; + } + } + + if (Options.MustClone) + { + return false; + } + + BOOL CancelFlag = FALSE; + Success = !!::CopyFileExW(FromPath.c_str(), + ToPath.c_str(), + /* lpProgressRoutine */ nullptr, + /* lpData */ nullptr, + &CancelFlag, + /* dwCopyFlags */ 0); + + if (!Success) + { + throw std::system_error(std::error_code(::GetLastError(), std::system_category()), "file copy failed"); + } + + return Success; +} + +bool +WriteFile(std::filesystem::path Path, const IoBuffer* const* Data, size_t BufferCount) +{ + using namespace fmt::literals; + + CAtlFile Outfile; + HRESULT hRes = Outfile.Create(Path.c_str(), GENERIC_WRITE, FILE_SHARE_READ, CREATE_ALWAYS); + if (FAILED(hRes)) + { + zen::ThrowIfFailed(hRes, "File open failed for '{}'"_format(Path).c_str()); + } + + // TODO: this could be block-enlightened + + for (size_t i = 0; i < BufferCount; ++i) + { + uint64_t WriteSize = Data[i]->Size(); + const void* DataPtr = Data[i]->Data(); + + while (WriteSize) + { + uint64_t ChunkSize = zen::Min<uint64_t>(WriteSize, uint64_t(2) * 1024 * 1024 * 1024); + + hRes = Outfile.Write(DataPtr, gsl::narrow_cast<uint32_t>(WriteSize)); + + if (FAILED(hRes)) + { + zen::ThrowIfFailed(hRes, "File write failed for '{}'"_format(Path).c_str()); + } + + WriteSize -= ChunkSize; + DataPtr = reinterpret_cast<const uint8_t*>(DataPtr) + ChunkSize; + } + } + + return true; +} + +FileContents +ReadFile(std::filesystem::path Path) +{ + ATL::CHandle FromFile(CreateFileW(Path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, 0, nullptr)); + if (FromFile 
== INVALID_HANDLE_VALUE) + { + FromFile.Detach(); + return FileContents{.ErrorCode = std::error_code(::GetLastError(), std::system_category())}; + } + + FILE_END_OF_FILE_INFO FileSize; + if (!GetFileSizeEx(FromFile, &FileSize.EndOfFile)) + { + return FileContents{.ErrorCode = std::error_code(::GetLastError(), std::system_category())}; + } + + const uint64_t FileSizeBytes = FileSize.EndOfFile.QuadPart; + + FileContents Contents; + + Contents.Data.emplace_back(IoBuffer(IoBuffer::File, FromFile.Detach(), 0, FileSizeBytes)); + + return Contents; +} + +bool +ScanFile(std::filesystem::path Path, const uint64_t ChunkSize, std::function<void(const void* Data, size_t Size)>&& ProcessFunc) +{ + ATL::CHandle FromFile(CreateFileW(Path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, 0, nullptr)); + if (FromFile == INVALID_HANDLE_VALUE) + { + FromFile.Detach(); + return false; + } + + std::vector<uint8_t> ReadBuffer(ChunkSize); + + for (;;) + { + DWORD dwBytesRead = 0; + BOOL Success = ::ReadFile(FromFile, ReadBuffer.data(), (DWORD)ReadBuffer.size(), &dwBytesRead, nullptr); + + if (!Success) + { + throw std::system_error(std::error_code(::GetLastError(), std::system_category()), "file scan failed"); + } + + if (dwBytesRead == 0) + break; + + ProcessFunc(ReadBuffer.data(), dwBytesRead); + } + + return true; +} + +std::string +ToUtf8(const std::filesystem::path& Path) +{ + return WideToUtf8(Path.native().c_str()); +} + +void +FileSystemTraversal::TraverseFileSystem(const std::filesystem::path& RootDir, TreeVisitor& Visitor) +{ + uint64_t FileInfoBuffer[8 * 1024]; + + FILE_INFO_BY_HANDLE_CLASS FibClass = FileIdBothDirectoryRestartInfo; + bool Continue = true; + + std::wstring RootDirPath = RootDir.native(); + + CAtlFile RootDirHandle; + HRESULT hRes = RootDirHandle.Create(RootDirPath.c_str(), + GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS); + + zen::ThrowIfFailed(hRes, "Failed to open handle to volume root"); + + 
while (Continue) + { + BOOL Success = GetFileInformationByHandleEx(RootDirHandle, FibClass, FileInfoBuffer, sizeof FileInfoBuffer); + FibClass = FileIdBothDirectoryInfo; // Set up for next iteration + + uint64_t EntryOffset = 0; + + if (!Success) + { + DWORD LastError = GetLastError(); + + if (LastError == ERROR_NO_MORE_FILES) + { + break; + } + + throw std::system_error(std::error_code(LastError, std::system_category()), "file system traversal error"); + } + + for (;;) + { + const FILE_ID_BOTH_DIR_INFO* DirInfo = + reinterpret_cast<const FILE_ID_BOTH_DIR_INFO*>(reinterpret_cast<const uint8_t*>(FileInfoBuffer) + EntryOffset); + + std::wstring_view FileName(DirInfo->FileName, DirInfo->FileNameLength / sizeof(wchar_t)); + + if (DirInfo->FileAttributes & FILE_ATTRIBUTE_DIRECTORY) + { + if (FileName == L"."sv || FileName == L".."sv) + { + // Not very interesting + } + else + { + const bool ShouldDescend = Visitor.VisitDirectory(RootDir, FileName); + + if (ShouldDescend) + { + // Note that this recursion combined with the buffer could + // blow the stack, we should consider a different strategy + + std::filesystem::path FullPath = RootDir / FileName; + + TraverseFileSystem(FullPath, Visitor); + } + } + } + else if (DirInfo->FileAttributes & FILE_ATTRIBUTE_DEVICE) + { + spdlog::warn("encountered device node during file system traversal: {} found in {}", WideToUtf8(FileName), RootDir); + } + else + { + std::filesystem::path FullPath = RootDir / FileName; + + Visitor.VisitFile(RootDir, FileName, DirInfo->EndOfFile.QuadPart); + } + + const uint64_t NextOffset = DirInfo->NextEntryOffset; + + if (NextOffset == 0) + { + break; + } + + EntryOffset += NextOffset; + } + } +} + +} // namespace zen diff --git a/zencore/httpclient.cpp b/zencore/httpclient.cpp new file mode 100644 index 000000000..268483403 --- /dev/null +++ b/zencore/httpclient.cpp @@ -0,0 +1,23 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
/**
 * Convert a null-terminated UTF-8 string to a wide string.
 *
 * The decoder is lenient: it does not reject malformed sequences. Code points above
 * U+10FFFF are dropped; when wchar_t is 16 bits wide, code points above U+FFFF are
 * emitted as a UTF-16 surrogate pair and lone surrogate code points are dropped.
 * A null input yields an empty string.
 */
std::wstring
UTF8_to_wstring(const char* in)
{
	std::wstring out;

	if (in == nullptr)
	{
		return out;
	}

	// Must start at zero: a leading continuation byte shifts the previous value
	unsigned int codepoint = 0;

	while (*in != 0)
	{
		const unsigned char ch = static_cast<unsigned char>(*in);

		if (ch <= 0x7f)
			codepoint = ch;	 // single byte (ASCII)
		else if (ch <= 0xbf)
			codepoint = (codepoint << 6) | (ch & 0x3f);	 // continuation byte
		else if (ch <= 0xdf)
			codepoint = ch & 0x1f;	// lead byte of a 2-byte sequence
		else if (ch <= 0xef)
			codepoint = ch & 0x0f;	// lead byte of a 3-byte sequence
		else
			codepoint = ch & 0x07;	// lead byte of a 4-byte sequence

		++in;

		// Emit once the next byte is not a continuation byte, i.e. the sequence is complete
		if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
		{
			if (sizeof(wchar_t) > 2)
			{
				out.append(1, static_cast<wchar_t>(codepoint));
			}
			else if (codepoint > 0xffff)
			{
				// UTF-16 surrogates encode the offset from U+10000, not the raw code point
				const unsigned int offset = codepoint - 0x10000;

				out.append(1, static_cast<wchar_t>(0xd800 + (offset >> 10)));
				out.append(1, static_cast<wchar_t>(0xdc00 + (offset & 0x03ff)));
			}
			else if (codepoint < 0xd800 || codepoint >= 0xe000)
			{
				out.append(1, static_cast<wchar_t>(codepoint));
			}
		}
	}

	return out;
}
/**
 * Map an HTTP status code to its reason phrase.
 *
 * Returns a pointer to a string literal (never null); codes without an entry map to
 * "Unknown Result".
 */
const char*
ReasonStringForHttpResultCode(int HttpCode)
{
	switch (HttpCode)
	{
			// 1xx Informational

		case 100:
			return "Continue";
		case 101:
			return "Switching Protocols";

			// 2xx Success

		case 200:
			return "OK";
		case 201:
			return "Created";
		case 202:
			return "Accepted";
		case 203:
			return "Non-Authoritative Information";
		case 204:
			return "No Content";
		case 205:
			return "Reset Content";
		case 206:
			return "Partial Content";

			// 3xx Redirection

		case 300:
			return "Multiple Choices";
		case 301:
			return "Moved Permanently";
		case 302:
			return "Found";
		case 303:
			return "See Other";
		case 304:
			return "Not Modified";
		case 305:
			return "Use Proxy";
		case 306:
			return "Switch Proxy";
		case 307:
			return "Temporary Redirect";
		case 308:
			return "Permanent Redirect";

			// 4xx Client errors

		case 400:
			return "Bad Request";
		case 401:
			return "Unauthorized";
		case 402:
			return "Payment Required";
		case 403:
			return "Forbidden";
		case 404:
			return "Not Found";
		case 405:
			return "Method Not Allowed";
		case 406:
			return "Not Acceptable";
		case 407:
			return "Proxy Authentication Required";
		case 408:
			return "Request Timeout";
		case 409:
			return "Conflict";
		case 410:
			return "Gone";
		case 411:
			return "Length Required";
		case 412:
			return "Precondition Failed";
		case 413:
			return "Payload Too Large";
		case 414:
			return "URI Too Long";
		case 415:
			return "Unsupported Media Type";
		case 416:
			return "Range Not Satisfiable";
		case 417:
			return "Expectation Failed";
		case 418:
			return "I'm a teapot";
		case 421:
			return "Misdirected Request";
		case 422:
			return "Unprocessable Entity";
		case 423:
			return "Locked";
		case 424:
			return "Failed Dependency";
		case 425:
			return "Too Early";
		case 426:
			return "Upgrade Required";
		case 428:
			return "Precondition Required";
		case 429:
			return "Too Many Requests";
		case 431:
			return "Request Header Fields Too Large";
		case 451:
			return "Unavailable For Legal Reasons";

			// 5xx Server errors

		case 500:
			return "Internal Server Error";
		case 501:
			return "Not Implemented";
		case 502:
			return "Bad Gateway";
		case 503:
			return "Service Unavailable";
		case 504:
			return "Gateway Timeout";
		case 505:
			return "HTTP Version Not Supported";
		case 506:
			return "Variant Also Negotiates";
		case 507:
			return "Insufficient Storage";
		case 508:
			return "Loop Detected";
		case 510:
			return "Not Extended";
		case 511:
			return "Network Authentication Required";

		default:
			return "Unknown Result";
	}
}
virtual void VisitHash(const IoHash& Value) override { ZEN_UNUSED(Value); } + virtual void VisitUuid(const Guid& Value) override { ZEN_UNUSED(Value); } + virtual void VisitObjectId(const Oid& Value) override { ZEN_UNUSED(Value); } + virtual void VisitDateTime(DateTime Value) override { ZEN_UNUSED(Value); } + virtual void VisitTimeSpan(TimeSpan Value) override { ZEN_UNUSED(Value); } + + ExtendableStringBuilder<256> OutText; + } _; + // Data.CreateRefIterator().VisitFields(_); + return WriteResponse(HttpResponseCode, HttpContentType::kJSON, _.OutText); +#else + SharedBuffer Buf = Data.GetBuffer(); + std::array<IoBuffer, 1> buffers{IoBufferBuilder::MakeCloneFromMemory(Buf.GetData(), Buf.GetSize())}; + return WriteResponse(HttpResponseCode, HttpContentType::kCbObject, buffers); +#endif +} + +void +HttpServerRequest::WriteResponse(HttpResponse HttpResponseCode, HttpContentType ContentType, std::string_view ResponseString) +{ + return WriteResponse(HttpResponseCode, ContentType, std::u8string_view{(char8_t*)ResponseString.data(), ResponseString.size()}); +} + +void +HttpServerRequest::WriteResponse(HttpResponse HttpResponseCode, HttpContentType ContentType, IoBuffer Blob) +{ + std::array<IoBuffer, 1> buffers{Blob}; + return WriteResponse(HttpResponseCode, ContentType, buffers); +} + +HttpServerRequest::QueryParams +HttpServerRequest::GetQueryParams() +{ + QueryParams Params; + + const std::string_view QStr = QueryString(); + + const char* QueryIt = QStr.data(); + const char* QueryEnd = QueryIt + QStr.size(); + + while (QueryIt != QueryEnd) + { + if (*QueryIt == '&') + { + ++QueryIt; + continue; + } + + const std::string_view Query{QueryIt, QueryEnd}; + + size_t DelimIndex = Query.find('&', 0); + + if (DelimIndex == std::string_view::npos) + { + DelimIndex = Query.size(); + } + + std::string_view ThisQuery{QueryIt, DelimIndex}; + + size_t EqIndex = ThisQuery.find('=', 0); + + if (EqIndex != std::string_view::npos) + { + std::string_view Parm{ThisQuery.data(), EqIndex}; + 
ThisQuery.remove_prefix(EqIndex + 1); + + Params.KvPairs.emplace_back(Parm, ThisQuery); + } + + QueryIt += DelimIndex; + } + + return std::move(Params); +} + +////////////////////////////////////////////////////////////////////////// +// +// HTTP +// + +class HttpSysServer; +class HttpTransaction; + +class HttpSysRequestHandler +{ +public: + HttpSysRequestHandler(HttpTransaction& InRequest) : m_Request(InRequest) {} + virtual ~HttpSysRequestHandler() = default; + + virtual void IssueRequest() = 0; + virtual HttpSysRequestHandler* HandleCompletion(ULONG IoResult, ULONG_PTR NumberOfBytesTransferred) = 0; + + HttpTransaction& Transaction() { return m_Request; } + +private: + HttpTransaction& m_Request; // Outermost HTTP transaction object +}; + +/** HTTP transaction + + There will be an instance of this per pending and in-flight HTTP transaction + + */ +class HttpTransaction +{ +public: + HttpTransaction(HttpSysServer& Server) : m_HttpServer(Server), m_HttpHandler(&m_InitialHttpHandler) {} + + virtual ~HttpTransaction() {} + + enum class Status + { + kDone, + kRequestPending + }; + + Status HandleCompletion(ULONG IoResult, ULONG_PTR NumberOfBytesTransferred); + + static void __stdcall IoCompletionCallback(PTP_CALLBACK_INSTANCE Instance, + PVOID pContext /* HttpSysServer */, + PVOID pOverlapped, + ULONG IoResult, + ULONG_PTR NumberOfBytesTransferred, + PTP_IO Io) + { + UNREFERENCED_PARAMETER(Io); + UNREFERENCED_PARAMETER(Instance); + UNREFERENCED_PARAMETER(pContext); + + // Note that for a given transaction we may be in this completion function on more + // than one thread at any given moment. 
This means we need to be careful about what + // happens in here + + HttpTransaction* Transaction = CONTAINING_RECORD(pOverlapped, HttpTransaction, m_HttpOverlapped); + + if (Transaction->HandleCompletion(IoResult, NumberOfBytesTransferred) == HttpTransaction::Status::kDone) + { + delete Transaction; + } + } + + void IssueInitialRequest(); + + PTP_IO Iocp(); + HANDLE RequestQueueHandle(); + inline OVERLAPPED* Overlapped() { return &m_HttpOverlapped; } + inline HttpSysServer& Server() { return m_HttpServer; } + + inline PHTTP_REQUEST HttpRequest() { return m_InitialHttpHandler.HttpRequest(); } + +protected: + OVERLAPPED m_HttpOverlapped{}; + HttpSysServer& m_HttpServer; + HttpSysRequestHandler* m_HttpHandler{nullptr}; + RwLock m_Lock; + +private: + struct InitialRequestHandler : public HttpSysRequestHandler + { + inline PHTTP_REQUEST HttpRequest() { return (PHTTP_REQUEST)m_RequestBuffer; } + inline uint32_t RequestBufferSize() const { return sizeof m_RequestBuffer; } + + InitialRequestHandler(HttpTransaction& InRequest) : HttpSysRequestHandler(InRequest) {} + ~InitialRequestHandler() {} + + virtual void IssueRequest() override; + virtual HttpSysRequestHandler* HandleCompletion(ULONG IoResult, ULONG_PTR NumberOfBytesTransferred) override; + + PHTTP_REQUEST m_HttpRequestPtr = (HTTP_REQUEST*)(m_RequestBuffer); + UCHAR m_RequestBuffer[16384 + sizeof(HTTP_REQUEST)]; + } m_InitialHttpHandler{*this}; +}; + +////////////////////////////////////////////////////////////////////////// + +class HttpMessageResponseRequest : public HttpSysRequestHandler +{ +public: + HttpMessageResponseRequest(HttpTransaction& InRequest, uint16_t ResponseCode); + HttpMessageResponseRequest(HttpTransaction& InRequest, uint16_t ResponseCode, const char* Message); + HttpMessageResponseRequest(HttpTransaction& InRequest, uint16_t ResponseCode, const void* Payload, size_t PayloadSize); + HttpMessageResponseRequest(HttpTransaction& InRequest, uint16_t ResponseCode, std::span<IoBuffer> Blobs); + 
~HttpMessageResponseRequest(); + + virtual void IssueRequest() override; + virtual HttpSysRequestHandler* HandleCompletion(ULONG IoResult, ULONG_PTR NumberOfBytesTransferred) override; + + void SuppressResponseBody(); + +private: + std::vector<HTTP_DATA_CHUNK> m_HttpDataChunks; + uint64_t m_TotalDataSize = 0; // Sum of all chunk sizes + + uint16_t m_HttpResponseCode = 0; + uint32_t m_NextDataChunkOffset = 0; // This is used for responses where the number of chunks exceed the maximum number for one API call + uint32_t m_RemainingChunkCount = 0; + bool m_IsInitialResponse = true; + + void Initialize(uint16_t ResponseCode, std::span<IoBuffer> Blobs); + + std::vector<IoBuffer> m_DataBuffers; +}; + +HttpMessageResponseRequest::HttpMessageResponseRequest(HttpTransaction& InRequest, uint16_t ResponseCode) : HttpSysRequestHandler(InRequest) +{ + std::array<IoBuffer, 0> buffers; + + Initialize(ResponseCode, buffers); +} + +HttpMessageResponseRequest::HttpMessageResponseRequest(HttpTransaction& InRequest, uint16_t ResponseCode, const char* Message) +: HttpSysRequestHandler(InRequest) +{ + IoBuffer MessageBuffer(IoBuffer::Wrap, Message, strlen(Message)); + std::array<IoBuffer, 1> buffers({MessageBuffer}); + + Initialize(ResponseCode, buffers); +} + +HttpMessageResponseRequest::HttpMessageResponseRequest(HttpTransaction& InRequest, + uint16_t ResponseCode, + const void* Payload, + size_t PayloadSize) +: HttpSysRequestHandler(InRequest) +{ + IoBuffer MessageBuffer(IoBuffer::Wrap, Payload, PayloadSize); + std::array<IoBuffer, 1> buffers({MessageBuffer}); + + Initialize(ResponseCode, buffers); +} + +HttpMessageResponseRequest::HttpMessageResponseRequest(HttpTransaction& InRequest, uint16_t ResponseCode, std::span<IoBuffer> Blobs) +: HttpSysRequestHandler(InRequest) +{ + Initialize(ResponseCode, Blobs); +} + +HttpMessageResponseRequest::~HttpMessageResponseRequest() +{ +} + +void +HttpMessageResponseRequest::Initialize(uint16_t ResponseCode, std::span<IoBuffer> Blobs) +{ + 
m_HttpResponseCode = ResponseCode; + + const uint32_t ChunkCount = (uint32_t)Blobs.size(); + + m_HttpDataChunks.resize(ChunkCount); + m_DataBuffers.reserve(ChunkCount); + + for (IoBuffer& Buffer : Blobs) + { + m_DataBuffers.emplace_back(std::move(Buffer)).MakeOwned(); + } + + // Initialize the full array up front + + uint64_t LocalDataSize = 0; + + { + PHTTP_DATA_CHUNK ChunkPtr = m_HttpDataChunks.data(); + + for (IoBuffer& Buffer : m_DataBuffers) + { + const ULONG BufferDataSize = (ULONG)Buffer.Size(); + + ZEN_ASSERT(BufferDataSize); + + IoBufferFileReference FileRef; + if (Buffer.GetFileReference(/* out */ FileRef)) + { + ChunkPtr->DataChunkType = HttpDataChunkFromFileHandle; + ChunkPtr->FromFileHandle.FileHandle = FileRef.FileHandle; + ChunkPtr->FromFileHandle.ByteRange.StartingOffset.QuadPart = FileRef.FileChunkOffset; + ChunkPtr->FromFileHandle.ByteRange.Length.QuadPart = BufferDataSize; + } + else + { + ChunkPtr->DataChunkType = HttpDataChunkFromMemory; + ChunkPtr->FromMemory.pBuffer = (void*)Buffer.Data(); + ChunkPtr->FromMemory.BufferLength = BufferDataSize; + } + ++ChunkPtr; + + LocalDataSize += BufferDataSize; + } + } + + m_RemainingChunkCount = ChunkCount; + m_TotalDataSize = LocalDataSize; +} + +void +HttpMessageResponseRequest::SuppressResponseBody() +{ + m_RemainingChunkCount = 0; + m_HttpDataChunks.clear(); + m_DataBuffers.clear(); +} + +HttpSysRequestHandler* +HttpMessageResponseRequest::HandleCompletion(ULONG IoResult, ULONG_PTR NumberOfBytesTransferred) +{ + ZEN_UNUSED(NumberOfBytesTransferred); + ZEN_UNUSED(IoResult); + + if (m_RemainingChunkCount == 0) + return nullptr; // All done + + return this; +} + +void +HttpMessageResponseRequest::IssueRequest() +{ + HttpTransaction& Tx = Transaction(); + HTTP_REQUEST* const HttpReq = Tx.HttpRequest(); + PTP_IO const Iocp = Tx.Iocp(); + + StartThreadpoolIo(Iocp); + + // Split payload into batches to play well with the underlying API + + const int MaxChunksPerCall = 9999; + + const int ThisRequestChunkCount 
= std::min<int>(m_RemainingChunkCount, MaxChunksPerCall); + const int ThisRequestChunkOffset = m_NextDataChunkOffset; + + m_RemainingChunkCount -= ThisRequestChunkCount; + m_NextDataChunkOffset += ThisRequestChunkCount; + + ULONG SendFlags = 0; + + if (m_RemainingChunkCount) + { + // We need to make more calls to send the full amount of data + SendFlags |= HTTP_SEND_RESPONSE_FLAG_MORE_DATA; + } + + ULONG SendResult = 0; + + if (m_IsInitialResponse) + { + // Populate response structure + + HTTP_RESPONSE HttpResponse = {}; + + HttpResponse.EntityChunkCount = USHORT(ThisRequestChunkCount); + HttpResponse.pEntityChunks = m_HttpDataChunks.data() + ThisRequestChunkOffset; + + // Content-length header + + char ContentLengthString[32]; + _ui64toa_s(m_TotalDataSize, ContentLengthString, sizeof ContentLengthString, 10); + + PHTTP_KNOWN_HEADER ContentLengthHeader = &HttpResponse.Headers.KnownHeaders[HttpHeaderContentLength]; + ContentLengthHeader->pRawValue = ContentLengthString; + ContentLengthHeader->RawValueLength = (USHORT)strlen(ContentLengthString); + + // Content-type header + + PHTTP_KNOWN_HEADER ContentTypeHeader = &HttpResponse.Headers.KnownHeaders[HttpHeaderContentType]; + + ContentTypeHeader->pRawValue = "application/octet-stream"; /* TODO! 
We must respect the content type specified */ + ContentTypeHeader->RawValueLength = (USHORT)strlen(ContentTypeHeader->pRawValue); + + HttpResponse.StatusCode = m_HttpResponseCode; + HttpResponse.pReason = ReasonStringForHttpResultCode(m_HttpResponseCode); + HttpResponse.ReasonLength = (USHORT)strlen(HttpResponse.pReason); + + // Cache policy + + HTTP_CACHE_POLICY CachePolicy; + + CachePolicy.Policy = HttpCachePolicyNocache; // HttpCachePolicyUserInvalidates; + CachePolicy.SecondsToLive = 0; + + // Initial response API call + + SendResult = HttpSendHttpResponse(Tx.RequestQueueHandle(), + HttpReq->RequestId, + SendFlags, + &HttpResponse, + &CachePolicy, + NULL, + NULL, + 0, + Tx.Overlapped(), + NULL); + + m_IsInitialResponse = false; + } + else + { + // Subsequent response API calls + + SendResult = HttpSendResponseEntityBody(Tx.RequestQueueHandle(), + HttpReq->RequestId, + SendFlags, + (USHORT)ThisRequestChunkCount, // EntityChunkCount + &m_HttpDataChunks[ThisRequestChunkOffset], // EntityChunks + NULL, // BytesSent + NULL, // Reserved1 + 0, // Reserved2 + Tx.Overlapped(), // Overlapped + NULL // LogData + ); + } + + if ((SendResult != NO_ERROR) // Synchronous completion, but the completion event will still be posted to IOCP + && (SendResult != ERROR_IO_PENDING) // Asynchronous completion + ) + { + // Some error occurred, no completion will be posted + + CancelThreadpoolIo(Iocp); + + spdlog::error("failed to send HTTP response (error: {}) URL: {}", SendResult, HttpReq->pRawUrl); + + throw HttpServerException("Failed to send HTTP response", SendResult); + } +} + +////////////////////////////////////////////////////////////////////////// + +class HttpSysServer +{ + friend class HttpTransaction; + +public: + HttpSysServer(WinIoThreadPool& InThreadPool); + ~HttpSysServer(); + + void Initialize(const wchar_t* UrlPath); + void Run(bool TestMode); + + void RequestExit() { m_ShutdownEvent.Set(); } + + void StartServer(); + void StopServer(); + + void OnHandlingRequest(); + 
void IssueNewRequestMaybe(); + + inline bool IsOk() const { return m_IsOk; } + + void AddEndpoint(const char* Endpoint, HttpService& Service); + void RemoveEndpoint(const char* Endpoint, HttpService& Service); + +private: + bool m_IsOk = false; + bool m_IsHttpInitialized = false; + WinIoThreadPool& m_ThreadPool; + + std::wstring m_BaseUri; // http://*:nnnn/ + HTTP_SERVER_SESSION_ID m_HttpSessionId = 0; + HTTP_URL_GROUP_ID m_HttpUrlGroupId = 0; + HANDLE m_RequestQueueHandle = 0; + std::atomic_int32_t m_PendingRequests{0}; + int32_t m_MinPendingRequests = 4; + int32_t m_MaxPendingRequests = 32; + Event m_ShutdownEvent; +}; + +HttpSysServer::HttpSysServer(WinIoThreadPool& InThreadPool) : m_ThreadPool(InThreadPool) +{ + ULONG Result = HttpInitialize(HTTPAPI_VERSION_2, HTTP_INITIALIZE_SERVER, nullptr); + + if (Result != NO_ERROR) + { + return; + } + + m_IsHttpInitialized = true; + m_IsOk = true; +} + +HttpSysServer::~HttpSysServer() +{ + if (m_IsHttpInitialized) + { + HttpTerminate(HTTP_INITIALIZE_SERVER, nullptr); + } +} + +void +HttpSysServer::Initialize(const wchar_t* UrlPath) +{ + // check(bIsOk); + + ULONG Result = HttpCreateServerSession(HTTPAPI_VERSION_2, &m_HttpSessionId, 0); + + if (Result != NO_ERROR) + { + // Flag error + + return; + } + + Result = HttpCreateUrlGroup(m_HttpSessionId, &m_HttpUrlGroupId, 0); + + if (Result != NO_ERROR) + { + // Flag error + + return; + } + + m_BaseUri = UrlPath; + + Result = HttpAddUrlToUrlGroup(m_HttpUrlGroupId, UrlPath, /* #TODO UrlContext */ HTTP_URL_CONTEXT(0), 0); + + if (Result != NO_ERROR) + { + // Flag error + + return; + } + + HTTP_BINDING_INFO HttpBindingInfo = {{0}, 0}; + + Result = HttpCreateRequestQueue(HTTPAPI_VERSION_2, NULL, NULL, 0, &m_RequestQueueHandle); + + if (Result != NO_ERROR) + { + // Flag error! 
+ + return; + } + + HttpBindingInfo.Flags.Present = 1; + HttpBindingInfo.RequestQueueHandle = m_RequestQueueHandle; + + Result = HttpSetUrlGroupProperty(m_HttpUrlGroupId, HttpServerBindingProperty, &HttpBindingInfo, sizeof(HttpBindingInfo)); + + if (Result != NO_ERROR) + { + // Flag error! + + return; + } + + // Create I/O completion port + + m_ThreadPool.CreateIocp(m_RequestQueueHandle, HttpTransaction::IoCompletionCallback, this); + + // Check result! +} + +void +HttpSysServer::StartServer() +{ + int RequestCount = 32; + + for (int i = 0; i < RequestCount; ++i) + { + IssueNewRequestMaybe(); + } +} + +void +HttpSysServer::Run(bool TestMode) +{ + if (TestMode == false) + { + printf("Zen Server running. Press ESC or Q to quit\n"); + } + + bool KeepRunning = true; + + do + { + int WaitTimeout = -1; + + if (!TestMode) + { + WaitTimeout = 1000; + } + + if (!TestMode && _kbhit() != 0) + { + char c = (char)_getch(); + + if (c == 27 || c == 'Q' || c == 'q') + { + RequestApplicationExit(0); + } + } + + m_ShutdownEvent.Wait(WaitTimeout); + } while (!IsApplicationExitRequested()); +} + +void +HttpSysServer::OnHandlingRequest() +{ + --m_PendingRequests; + + if (m_PendingRequests > m_MinPendingRequests) + { + // We have more than the minimum number of requests pending, just let someone else + // enqueue new requests + return; + } + + IssueNewRequestMaybe(); +} + +void +HttpSysServer::IssueNewRequestMaybe() +{ + if (m_PendingRequests.load(std::memory_order::relaxed) >= m_MaxPendingRequests) + { + return; + } + + std::unique_ptr<HttpTransaction> Request = std::make_unique<HttpTransaction>(*this); + + Request->IssueInitialRequest(); + + // This may end up exceeding the MaxPendingRequests limit, but it's not + // really a problem. 
I'm doing it this way mostly to avoid dealing with + // exceptions here + ++m_PendingRequests; + + Request.release(); +} + +void +HttpSysServer::StopServer() +{ +} + +void +HttpSysServer::AddEndpoint(const char* UrlPath, HttpService& Service) +{ + if (UrlPath[0] == '/') + { + ++UrlPath; + } + + const std::wstring Path16 = UTF8_to_wstring(UrlPath); + Service.SetUriPrefixLength(Path16.size() + 1 /* leading slash */); + + // Convert to wide string + + std::wstring Url16 = m_BaseUri + Path16; + + ULONG Result = HttpAddUrlToUrlGroup(m_HttpUrlGroupId, Url16.c_str(), HTTP_URL_CONTEXT(&Service), 0 /* Reserved */); + + if (Result != NO_ERROR) + { + spdlog::error("HttpAddUrlToUrlGroup failed with result {}", Result); + + return; + } +} + +void +HttpSysServer::RemoveEndpoint(const char* UrlPath, HttpService& Service) +{ + ZEN_UNUSED(Service); + + if (UrlPath[0] == '/') + { + ++UrlPath; + } + + const std::wstring Path16 = UTF8_to_wstring(UrlPath); + + // Convert to wide string + + std::wstring Url16 = m_BaseUri + Path16; + + ULONG Result = HttpRemoveUrlFromUrlGroup(m_HttpUrlGroupId, Url16.c_str(), 0); + + if (Result != NO_ERROR) + { + spdlog::error("HttpRemoveUrlFromUrlGroup failed with result {}", Result); + } +} + +////////////////////////////////////////////////////////////////////////// + +class HttpSysServerRequest : public HttpServerRequest +{ +public: + HttpSysServerRequest(HttpTransaction& Tx, HttpService& Service) : m_HttpTx(Tx) + { + PHTTP_REQUEST HttpRequestPtr = Tx.HttpRequest(); + + const int PrefixLength = Service.UriPrefixLength(); + const int AbsPathLength = HttpRequestPtr->CookedUrl.AbsPathLength / sizeof(char16_t); + + if (AbsPathLength >= PrefixLength) + { + // We convert the URI immediately because most of the code involved prefers to deal + // with utf8. 
This has some performance impact which I'd prefer to avoid but for now + // we just have to live with it + + WideToUtf8({(char16_t*)HttpRequestPtr->CookedUrl.pAbsPath + PrefixLength, gsl::narrow<size_t>(AbsPathLength - PrefixLength)}, + m_Uri); + } + else + { + m_Uri.Reset(); + } + + if (auto QueryStringLength = HttpRequestPtr->CookedUrl.QueryStringLength) + { + --QueryStringLength; + + WideToUtf8({(char16_t*)(HttpRequestPtr->CookedUrl.pQueryString) + 1, QueryStringLength / sizeof(char16_t)}, m_QueryString); + } + else + { + m_QueryString.Reset(); + } + + switch (HttpRequestPtr->Verb) + { + case HttpVerbOPTIONS: + m_Verb = HttpVerb::kOptions; + break; + + case HttpVerbGET: + m_Verb = HttpVerb::kGet; + break; + + case HttpVerbHEAD: + m_Verb = HttpVerb::kHead; + break; + + case HttpVerbPOST: + m_Verb = HttpVerb::kPost; + break; + + case HttpVerbPUT: + m_Verb = HttpVerb::kPut; + break; + + case HttpVerbDELETE: + m_Verb = HttpVerb::kDelete; + break; + + case HttpVerbCOPY: + m_Verb = HttpVerb::kCopy; + break; + + default: + // TODO: invalid request? 
+ m_Verb = (HttpVerb)0; + break; + } + + auto& clh = HttpRequestPtr->Headers.KnownHeaders[HttpHeaderContentLength]; + std::string_view cl(clh.pRawValue, clh.RawValueLength); + + std::from_chars(cl.data(), cl.data() + cl.size(), m_ContentLength); + } + + ~HttpSysServerRequest() {} + + virtual IoBuffer ReadPayload() override + { + // This is presently synchronous for simplicity, but we + // need to implement an asynchronous version also + + HTTP_REQUEST* const HttpReq = m_HttpTx.HttpRequest(); + + IoBuffer buffer(m_ContentLength); + + uint64_t BytesToRead = m_ContentLength; + + uint8_t* ReadPointer = (uint8_t*)buffer.Data(); + + // First deal with any payload which has already been copied + // into our request buffer + + const int EntityChunkCount = HttpReq->EntityChunkCount; + + for (int i = 0; i < EntityChunkCount; ++i) + { + HTTP_DATA_CHUNK& EntityChunk = HttpReq->pEntityChunks[i]; + + ZEN_ASSERT(EntityChunk.DataChunkType == HttpDataChunkFromMemory); + + const uint64_t BufferLength = EntityChunk.FromMemory.BufferLength; + + ZEN_ASSERT(BufferLength <= BytesToRead); + + memcpy(ReadPointer, EntityChunk.FromMemory.pBuffer, BufferLength); + + ReadPointer += BufferLength; + BytesToRead -= BufferLength; + } + + // Call http.sys API to receive the remaining data + + while (BytesToRead) + { + ULONG BytesRead = 0; + + ULONG ApiResult = HttpReceiveRequestEntityBody(m_HttpTx.RequestQueueHandle(), + HttpReq->RequestId, + 0, /* Flags */ + ReadPointer, + (ULONG)BytesToRead, + &BytesRead, + NULL /* Overlapped */ + ); + + if (ApiResult != NO_ERROR && ApiResult != ERROR_HANDLE_EOF) + { + throw HttpServerException("payload read failed", ApiResult); + } + + BytesToRead -= BytesRead; + ReadPointer += BytesRead; + } + + return buffer; + } + + virtual void WriteResponse(HttpResponse HttpResponseCode) override + { + ZEN_ASSERT(m_IsHandled == false); + + m_Response = new HttpMessageResponseRequest(m_HttpTx, (uint16_t)HttpResponseCode); + + if (m_SuppressBody) + { + 
m_Response->SuppressResponseBody(); + } + + m_IsHandled = true; + } + + virtual void WriteResponse(HttpResponse HttpResponseCode, HttpContentType ContentType, std::span<IoBuffer> Blobs) override + { + ZEN_ASSERT(m_IsHandled == false); + ZEN_UNUSED(ContentType); + + m_Response = new HttpMessageResponseRequest(m_HttpTx, (uint16_t)HttpResponseCode, Blobs); + + if (m_SuppressBody) + { + m_Response->SuppressResponseBody(); + } + + m_IsHandled = true; + } + + virtual void WriteResponse(HttpResponse HttpResponseCode, HttpContentType ContentType, std::u8string_view ResponseString) override + { + ZEN_ASSERT(m_IsHandled == false); + ZEN_UNUSED(ContentType); + + m_Response = new HttpMessageResponseRequest(m_HttpTx, (uint16_t)HttpResponseCode, ResponseString.data(), ResponseString.size()); + + if (m_SuppressBody) + { + m_Response->SuppressResponseBody(); + } + + m_IsHandled = true; + } + + HttpTransaction& m_HttpTx; + HttpMessageResponseRequest* m_Response = nullptr; +}; + +////////////////////////////////////////////////////////////////////////// + +PTP_IO +HttpTransaction::Iocp() +{ + return m_HttpServer.m_ThreadPool.Iocp(); +} + +HANDLE +HttpTransaction::RequestQueueHandle() +{ + return m_HttpServer.m_RequestQueueHandle; +} + +void +HttpTransaction::IssueInitialRequest() +{ + m_InitialHttpHandler.IssueRequest(); +} + +HttpTransaction::Status +HttpTransaction::HandleCompletion(ULONG IoResult, ULONG_PTR NumberOfBytesTransferred) +{ + // We use this to ensure sequential execution of completion handlers + // for any given transaction. 
+ RwLock::ExclusiveLockScope _(m_Lock); + + bool RequestPending = false; + + if (HttpSysRequestHandler* CurrentHandler = m_HttpHandler) + { + const bool IsInitialRequest = (CurrentHandler == &m_InitialHttpHandler); + + if (IsInitialRequest) + { + // Ensure we have a sufficient number of pending requests outstanding + m_HttpServer.OnHandlingRequest(); + } + + m_HttpHandler = CurrentHandler->HandleCompletion(IoResult, NumberOfBytesTransferred); + + if (m_HttpHandler) + { + try + { + m_HttpHandler->IssueRequest(); + + RequestPending = true; + } + catch (std::exception& Ex) + { + spdlog::error("exception caught from IssueRequest(): {}", Ex.what()); + + // something went wrong, no request is pending + } + } + else + { + if (IsInitialRequest == false) + { + delete CurrentHandler; + } + } + } + + m_HttpServer.IssueNewRequestMaybe(); + + if (RequestPending) + { + return Status::kRequestPending; + } + + return Status::kDone; +} + +////////////////////////////////////////////////////////////////////////// + +void +HttpTransaction::InitialRequestHandler::IssueRequest() +{ + PTP_IO Iocp = Transaction().Iocp(); + + StartThreadpoolIo(Iocp); + + HttpTransaction& Tx = Transaction(); + + HTTP_REQUEST* HttpReq = Tx.HttpRequest(); + + ULONG Result = HttpReceiveHttpRequest(Tx.RequestQueueHandle(), + HTTP_NULL_ID, + HTTP_RECEIVE_REQUEST_FLAG_COPY_BODY, + HttpReq, + RequestBufferSize(), + NULL, + Tx.Overlapped()); + + if (Result != ERROR_IO_PENDING && Result != NO_ERROR) + { + CancelThreadpoolIo(Iocp); + + if (Result == ERROR_MORE_DATA) + { + // ProcessReceiveAndPostResponse(pIoRequest, pServerContext->Io, ERROR_MORE_DATA); + } + + // CleanupHttpIoRequest(pIoRequest); + + fprintf(stderr, "HttpReceiveHttpRequest failed, error 0x%lx\n", Result); + + return; + } +} + +HttpSysRequestHandler* +HttpTransaction::InitialRequestHandler::HandleCompletion(ULONG IoResult, ULONG_PTR NumberOfBytesTransferred) +{ + ZEN_UNUSED(IoResult); + ZEN_UNUSED(NumberOfBytesTransferred); + + // Route requests + + 
try + { + if (HttpService* Service = reinterpret_cast<HttpService*>(m_HttpRequestPtr->UrlContext)) + { + HttpSysServerRequest ThisRequest(Transaction(), *Service); + + Service->HandleRequest(ThisRequest); + + if (!ThisRequest.IsHandled()) + { + return new HttpMessageResponseRequest(Transaction(), 404, "Not found"); + } + + if (ThisRequest.m_Response) + { + return ThisRequest.m_Response; + } + } + + // Unable to route + return new HttpMessageResponseRequest(Transaction(), 404, "Item unknown"); + } + catch (std::exception& ex) + { + // TODO provide more meaningful error output + + return new HttpMessageResponseRequest(Transaction(), 500, ex.what()); + } +} + +////////////////////////////////////////////////////////////////////////// + +struct HttpServer::Impl : public RefCounted +{ + WinIoThreadPool m_ThreadPool; + HttpSysServer m_HttpServer; + + Impl(int ThreadCount) : m_ThreadPool(ThreadCount), m_HttpServer(m_ThreadPool) {} + + void Initialize(int BasePort) + { + using namespace std::literals; + + WideStringBuilder<64> BaseUri; + BaseUri << u8"http://*:"sv << int64_t(BasePort) << u8"/"sv; + + m_HttpServer.Initialize(BaseUri.c_str()); + m_HttpServer.StartServer(); + } + + void Run(bool TestMode) { m_HttpServer.Run(TestMode); } + + void RequestExit() { m_HttpServer.RequestExit(); } + + void Cleanup() { m_HttpServer.StopServer(); } + + void AddEndpoint(const char* Endpoint, HttpService& Service) { m_HttpServer.AddEndpoint(Endpoint, Service); } + + void AddEndpoint(const char* endpoint, std::function<void(HttpServerRequest&)> handler) + { + ZEN_UNUSED(endpoint, handler); + + ZEN_NOT_IMPLEMENTED(); + } +}; + +HttpServer::HttpServer() +{ + m_Impl = new Impl(32); +} + +HttpServer::~HttpServer() +{ + m_Impl->Cleanup(); +} + +void +HttpServer::AddEndpoint(HttpService& Service) +{ + m_Impl->AddEndpoint(Service.BaseUri(), Service); +} + +void +HttpServer::AddEndpoint(const char* endpoint, std::function<void(HttpServerRequest&)> handler) +{ + m_Impl->AddEndpoint(endpoint, 
handler); +} + +void +HttpServer::Initialize(int BasePort) +{ + m_Impl->Initialize(BasePort); +} + +void +HttpServer::Run(bool TestMode) +{ + m_Impl->Run(TestMode); +} + +void +HttpServer::RequestExit() +{ + m_Impl->RequestExit(); +} + +////////////////////////////////////////////////////////////////////////// + +void +HttpRequestRouter::AddPattern(const char* Id, const char* Regex) +{ + ZEN_ASSERT(m_PatternMap.find(Id) == m_PatternMap.end()); + + m_PatternMap.insert({Id, Regex}); +} + +void +HttpRequestRouter::RegisterRoute(const char* Regex, HttpRequestRouter::HandlerFunc_t&& HandlerFunc, HttpVerb SupportedVerbs) +{ + // Expand patterns + + ExtendableStringBuilder<128> ExpandedRegex; + + size_t RegexLen = strlen(Regex); + + for (size_t i = 0; i < RegexLen;) + { + bool matched = false; + + if (Regex[i] == '{' && ((i == 0) || (Regex[i - 1] != '\\'))) + { + // Might have a pattern reference - find closing brace + + for (size_t j = i + 1; j < RegexLen; ++j) + { + if (Regex[j] == '}') + { + std::string Pattern(&Regex[i + 1], j - i - 1); + + if (auto it = m_PatternMap.find(Pattern); it != m_PatternMap.end()) + { + ExpandedRegex.Append(it->second.c_str()); + } + else + { + // Default to anything goes (or should this just be an error?) 
+ + ExpandedRegex.Append("(.+?)"); + } + + // skip ahead + i = j + 1; + + matched = true; + + break; + } + } + } + + if (!matched) + { + ExpandedRegex.Append(Regex[i++]); + } + } + + m_Handlers.emplace_back(ExpandedRegex.c_str(), SupportedVerbs, std::move(HandlerFunc), Regex); +} + +bool +HttpRequestRouter::HandleRequest(zen::HttpServerRequest& Request) +{ + const HttpVerb Verb = Request.RequestVerb(); + + std::string_view Uri = Request.RelativeUri(); + HttpRouterRequest RouterRequest(Request); + + for (const auto& Handler : m_Handlers) + { + if ((Handler.Verbs & Verb) == Verb && regex_match(begin(Uri), end(Uri), RouterRequest.m_Match, Handler.RegEx)) + { + Handler.Handler(RouterRequest); + + return true; // Route matched + } + } + + return false; // No route matched +} + +TEST_CASE("http") +{ + using namespace std::literals; + + SUBCASE("router") + { + HttpRequestRouter r; + r.AddPattern("a", "[[:alpha:]]+"); + r.RegisterRoute( + "{a}", + [&](auto) {}, + HttpVerb::kGet); + + // struct TestHttpServerRequest : public HttpServerRequest + //{ + // TestHttpServerRequest(std::string_view Uri) : m_uri{Uri} {} + //}; + + // TestHttpServerRequest req{}; + // r.HandleRequest(req); + } +} + +void +http_forcelink() +{ +} + +} // namespace zen diff --git a/zencore/include/zencore/atomic.h b/zencore/include/zencore/atomic.h new file mode 100644 index 000000000..457128bd4 --- /dev/null +++ b/zencore/include/zencore/atomic.h @@ -0,0 +1,43 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <intrin.h> +#include <cinttypes> + +namespace zen { +/** Thin wrappers over the MSVC _Interlocked* intrinsics: each one casts the operand to the signed type the intrinsic takes and returns the intrinsic's result. AtomicIncrement/AtomicDecrement therefore return what _InterlockedIncrement/_InterlockedDecrement return (documented by MSVC as the value after the update). */ +inline uint32_t +AtomicIncrement(volatile uint32_t& value) +{ + return _InterlockedIncrement((long volatile*)&value); +} +inline uint32_t +AtomicDecrement(volatile uint32_t& value) +{ + return _InterlockedDecrement((long volatile*)&value); +} +/** 64-bit overloads of the two functions above, forwarding to _InterlockedIncrement64/_InterlockedDecrement64. */ +inline uint64_t +AtomicIncrement(volatile uint64_t& value) +{ + return _InterlockedIncrement64((__int64 volatile*)&value); +} +inline uint64_t +AtomicDecrement(volatile uint64_t& value) +{ + return _InterlockedDecrement64((__int64 volatile*)&value); +} +/** Adds amount to value via _InterlockedExchangeAdd/_InterlockedExchangeAdd64 and returns that intrinsic's result, which MSVC documents as the value held before the addition - note this differs from AtomicIncrement/AtomicDecrement. */ +inline uint32_t +AtomicAdd(volatile uint32_t& value, uint32_t amount) +{ + return _InterlockedExchangeAdd((long volatile*)&value, amount); +} +inline uint64_t +AtomicAdd(volatile uint64_t& value, uint64_t amount) +{ + return _InterlockedExchangeAdd64((__int64 volatile*)&value, amount); +} + +} // namespace zen diff --git a/zencore/include/zencore/blake3.h b/zencore/include/zencore/blake3.h new file mode 100644 index 000000000..1ef921c30 --- /dev/null +++ b/zencore/include/zencore/blake3.h @@ -0,0 +1,57 @@ +// Copyright Epic Games, Inc. All Rights Reserved.
+ +#pragma once + +#include <cinttypes> +#include <compare> +#include <cstring> + +namespace zen { + +class StringBuilderBase; + +/** + * BLAKE3 hash - 256 bits + */ +struct BLAKE3 +{ + uint8_t Hash[32]; + + inline auto operator<=>(const BLAKE3& rhs) const = default; + + static BLAKE3 HashMemory(const void* data, size_t byteCount); + static BLAKE3 FromHexString(const char* string); + const char* ToHexString(char* outString /* StringLength (64) characters + NUL terminator, i.e. a String_t */) const; + StringBuilderBase& ToHexString(StringBuilderBase& outBuilder) const; + + static const int StringLength = 64; + typedef char String_t[StringLength + 1]; + + static BLAKE3 Zero; // Initialized to all zeroes + /** Hash functor: copies the first sizeof(size_t) bytes of the digest into a size_t and returns it */ + struct Hasher + { + size_t operator()(const BLAKE3& v) const + { + size_t h; + memcpy(&h, v.Hash, sizeof h); + return h; + } + }; +}; +/** Streaming BLAKE3 computation: feed data with Append(), obtain the digest with GetHash(), call Reset() before reusing an instance */ +struct BLAKE3Stream +{ + BLAKE3Stream(); + + void Reset(); /// Begin streaming hash compute (not needed on freshly constructed instance) + BLAKE3Stream& Append(const void* data, size_t byteCount); /// Append another chunk + BLAKE3 GetHash(); /// Obtain final hash. If you wish to reuse the instance call Reset() + +private: + alignas(16) uint8_t m_HashState[2048]; +}; + +void blake3_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/compactbinary.h b/zencore/include/zencore/compactbinary.h new file mode 100644 index 000000000..c2d276c21 --- /dev/null +++ b/zencore/include/zencore/compactbinary.h @@ -0,0 +1,1335 @@ +// Copyright Epic Games, Inc. All Rights Reserved.
+ +#pragma once + +#include <zencore/zencore.h> + +#include <zencore/enumflags.h> +#include <zencore/intmath.h> +#include <zencore/iobuffer.h> +#include <zencore/iohash.h> +#include <zencore/memory.h> +#include <zencore/meta.h> +#include <zencore/sharedbuffer.h> +#include <zencore/uid.h> +#include <zencore/varint.h> + +#include <functional> +#include <memory> +#include <string> +#include <string_view> +#include <type_traits> +#include <vector> + +#include <gsl/gsl-lite.hpp> + +namespace zen { + +class CbObjectView; +class CbArrayView; +class BinaryReader; +class BinaryWriter; + +class DateTime +{ +public: + explicit DateTime(uint64_t InTicks) : Ticks(InTicks) {} + inline DateTime(int Year, int Month, int Day, int Hours = 0, int Minutes = 0, int Seconds = 0, int MilliSeconds = 0) + { + Set(Year, Month, Day, Hours, Minutes, Seconds, MilliSeconds); + } + + inline uint64_t GetTicks() const { return Ticks; } + inline bool operator==(const DateTime& Rhs) const { return Ticks == Rhs.Ticks; } + inline auto operator<=>(const DateTime& Rhs) const { return Ticks - Rhs.Ticks; } + +private: + void Set(int Year, int Month, int Day, int Hours, int Minutes, int Seconds, int MilliSecond); + uint64_t Ticks; +}; + +class TimeSpan +{ +public: + explicit TimeSpan(uint64_t InTicks) : Ticks(InTicks) {} + inline TimeSpan(int Hours, int Minutes, int Seconds) { Set(0, Hours, Minutes, Seconds, 0); } + inline TimeSpan(int Days, int Hours, int Minutes, int Seconds) { Set(Days, Hours, Minutes, Seconds, 0); } + inline TimeSpan(int Days, int Hours, int Minutes, int Seconds, int Nanos) { Set(Days, Hours, Minutes, Seconds, Nanos); } + + inline uint64_t GetTicks() const { return Ticks; } + inline bool operator==(const TimeSpan& Rhs) const { return Ticks == Rhs.Ticks; } + inline auto operator<=>(const TimeSpan& Rhs) const { return Ticks - Rhs.Ticks; } + + /** + * Time span related constants. + */ + + /** The maximum number of ticks that can be represented in FTimespan. 
*/ + static constexpr int64_t MaxTicks = 9223372036854775807; + + /** The minimum number of ticks that can be represented in FTimespan. */ + static constexpr int64_t MinTicks = -9223372036854775807 - 1; + + /** The number of nanoseconds per tick. */ + static constexpr int64_t NanosecondsPerTick = 100; + + /** The number of timespan ticks per day. */ + static constexpr int64_t TicksPerDay = 864000000000; + + /** The number of timespan ticks per hour. */ + static constexpr int64_t TicksPerHour = 36000000000; + + /** The number of timespan ticks per microsecond. */ + static constexpr int64_t TicksPerMicrosecond = 10; + + /** The number of timespan ticks per millisecond. */ + static constexpr int64_t TicksPerMillisecond = 10000; + + /** The number of timespan ticks per minute. */ + static constexpr int64_t TicksPerMinute = 600000000; + + /** The number of timespan ticks per second. */ + static constexpr int64_t TicksPerSecond = 10000000; + + /** The number of timespan ticks per week. */ + static constexpr int64_t TicksPerWeek = 6048000000000; + + /** The number of timespan ticks per year (365 days, not accounting for leap years). */ + static constexpr int64_t TicksPerYear = 365 * TicksPerDay; + +private: + void Set(int Days, int Hours, int Minutes, int Seconds, int FractionNano); + + uint64_t Ticks; +}; + +struct Guid +{ + uint32_t A, B, C, D; +}; + +////////////////////////////////////////////////////////////////////////// + +/** + * Field types and flags for CbField. + * + * This is a private type and is only declared here to enable inline use below. + * + * DO NOT CHANGE THE VALUE OF ANY MEMBERS OF THIS ENUM! + * BACKWARD COMPATIBILITY REQUIRES THAT THESE VALUES BE FIXED! + * SERIALIZATION USES HARD-CODED CONSTANTS BASED ON THESE VALUES! + */ +enum class CbFieldType : uint8_t +{ + /** A field type that does not occur in a valid object. */ + None = 0x00, + + /** Null. Payload is empty. 
*/ + Null = 0x01, + + /** + * Object is an array of fields with unique non-empty names. + * + * Payload is a VarUInt byte count for the encoded fields followed by the fields. + */ + Object = 0x02, + /** + * UniformObject is an array of fields with the same field types and unique non-empty names. + * + * Payload is a VarUInt byte count for the encoded fields followed by the fields. + */ + UniformObject = 0x03, + + /** + * Array is an array of fields with no name that may be of different types. + * + * Payload is a VarUInt byte count, followed by a VarUInt item count, followed by the fields. + */ + Array = 0x04, + /** + * UniformArray is an array of fields with no name and with the same field type. + * + * Payload is a VarUInt byte count, followed by a VarUInt item count, followed by field type, + * followed by the fields without their field type. + */ + UniformArray = 0x05, + + /** Binary. Payload is a VarUInt byte count followed by the data. */ + Binary = 0x06, + + /** String in UTF-8. Payload is a VarUInt byte count then an unterminated UTF-8 string. */ + String = 0x07, + + /** + * Non-negative integer with the range of a 64-bit unsigned integer. + * + * Payload is the value encoded as a VarUInt. + */ + IntegerPositive = 0x08, + /** + * Negative integer with the range of a 64-bit signed integer. + * + * Payload is the ones' complement of the value encoded as a VarUInt. + */ + IntegerNegative = 0x09, + + /** Single precision float. Payload is one big endian IEEE 754 binary32 float. */ + Float32 = 0x0a, + /** Double precision float. Payload is one big endian IEEE 754 binary64 float. */ + Float64 = 0x0b, + + /** Boolean false value. Payload is empty. */ + BoolFalse = 0x0c, + /** Boolean true value. Payload is empty. */ + BoolTrue = 0x0d, + + /** + * CompactBinaryAttachment is a reference to a compact binary attachment stored externally. + * + * Payload is a 160-bit hash digest of the referenced compact binary data. 
+ */ + CompactBinaryAttachment = 0x0e, + /** + * BinaryAttachment is a reference to a binary attachment stored externally. + * + * Payload is a 160-bit hash digest of the referenced binary data. + */ + BinaryAttachment = 0x0f, + + /** Hash. Payload is a 160-bit hash digest. */ + Hash = 0x10, + /** UUID/GUID. Payload is a 128-bit UUID as defined by RFC 4122. */ + Uuid = 0x11, + + /** + * Date and time between 0001-01-01 00:00:00.0000000 and 9999-12-31 23:59:59.9999999. + * + * Payload is a big endian int64 count of 100ns ticks since 0001-01-01 00:00:00.0000000. + */ + DateTime = 0x12, + /** + * Difference between two date/time values. + * + * Payload is a big endian int64 count of 100ns ticks in the span, and may be negative. + */ + TimeSpan = 0x13, + + /** + * Object ID + * + * Payload is a 12-byte opaque identifier + */ + ObjectId = 0x14, + + /** + * CustomById identifies the sub-type of its payload by an integer identifier. + * + * Payload is a VarUInt byte count of the sub-type identifier and the sub-type payload, followed + * by a VarUInt of the sub-type identifier then the payload of the sub-type. + */ + CustomById = 0x1e, + /** + * CustomByType identifies the sub-type of its payload by a string identifier. + * + * Payload is a VarUInt byte count of the sub-type identifier and the sub-type payload, followed + * by a VarUInt byte count of the unterminated sub-type identifier, then the sub-type identifier + * without termination, then the payload of the sub-type. + */ + CustomByName = 0x1f, + + /** Reserved for future use as a flag. Do not add types in this range. */ + Reserved = 0x20, + + /** + * A transient flag which indicates that the object or array containing this field has stored + * the field type before the payload and name. Non-uniform objects and fields will set this. + * + * Note: Since the flag must never be serialized, this bit may be repurposed in the future. 
+ */ + HasFieldType = 0x40, + + /** A persisted flag which indicates that the field has a name stored before the payload. */ + HasFieldName = 0x80, +}; + +ENUM_CLASS_FLAGS(CbFieldType); + +/** Functions that operate on CbFieldType. */ +class CbFieldTypeOps +{ + static constexpr CbFieldType SerializedTypeMask = CbFieldType(0b1011'1111); + static constexpr CbFieldType TypeMask = CbFieldType(0b0011'1111); + static constexpr CbFieldType ObjectMask = CbFieldType(0b0011'1110); + static constexpr CbFieldType ObjectBase = CbFieldType(0b0000'0010); + static constexpr CbFieldType ArrayMask = CbFieldType(0b0011'1110); + static constexpr CbFieldType ArrayBase = CbFieldType(0b0000'0100); + static constexpr CbFieldType IntegerMask = CbFieldType(0b0011'1110); + static constexpr CbFieldType IntegerBase = CbFieldType(0b0000'1000); + static constexpr CbFieldType FloatMask = CbFieldType(0b0011'1100); + static constexpr CbFieldType FloatBase = CbFieldType(0b0000'1000); + static constexpr CbFieldType BoolMask = CbFieldType(0b0011'1110); + static constexpr CbFieldType BoolBase = CbFieldType(0b0000'1100); + static constexpr CbFieldType AttachmentMask = CbFieldType(0b0011'1110); + static constexpr CbFieldType AttachmentBase = CbFieldType(0b0000'1110); + + static void StaticAssertTypeConstants(); + +public: + /** The type with flags removed. */ + static constexpr inline CbFieldType GetType(CbFieldType Type) { return Type & TypeMask; } + /** The type with transient flags removed. 
*/ + static constexpr inline CbFieldType GetSerializedType(CbFieldType Type) { return Type & SerializedTypeMask; } + + static constexpr inline bool HasFieldType(CbFieldType Type) { return EnumHasAnyFlags(Type, CbFieldType::HasFieldType); } + static constexpr inline bool HasFieldName(CbFieldType Type) { return EnumHasAnyFlags(Type, CbFieldType::HasFieldName); } + + static constexpr inline bool IsNone(CbFieldType Type) { return GetType(Type) == CbFieldType::None; } + static constexpr inline bool IsNull(CbFieldType Type) { return GetType(Type) == CbFieldType::Null; } + + static constexpr inline bool IsObject(CbFieldType Type) { return (Type & ObjectMask) == ObjectBase; } + static constexpr inline bool IsArray(CbFieldType Type) { return (Type & ArrayMask) == ArrayBase; } + + static constexpr inline bool IsBinary(CbFieldType Type) { return GetType(Type) == CbFieldType::Binary; } + static constexpr inline bool IsString(CbFieldType Type) { return GetType(Type) == CbFieldType::String; } + + static constexpr inline bool IsInteger(CbFieldType Type) { return (Type & IntegerMask) == IntegerBase; } + /** Whether the field is a float, or integer due to implicit conversion. 
*/ + static constexpr inline bool IsFloat(CbFieldType Type) { return (Type & FloatMask) == FloatBase; } + static constexpr inline bool IsBool(CbFieldType Type) { return (Type & BoolMask) == BoolBase; } + + static constexpr inline bool IsCompactBinaryAttachment(CbFieldType Type) + { + return GetType(Type) == CbFieldType::CompactBinaryAttachment; + } + static constexpr inline bool IsBinaryAttachment(CbFieldType Type) { return GetType(Type) == CbFieldType::BinaryAttachment; } + static constexpr inline bool IsAttachment(CbFieldType Type) { return (Type & AttachmentMask) == AttachmentBase; } + + static constexpr inline bool IsHash(CbFieldType Type) + { + switch (GetType(Type)) + { + case CbFieldType::Hash: + case CbFieldType::BinaryAttachment: + case CbFieldType::CompactBinaryAttachment: + return true; + default: + return false; + } + } + + static constexpr inline bool IsUuid(CbFieldType Type) { return GetType(Type) == CbFieldType::Uuid; } + static constexpr inline bool IsObjectId(CbFieldType Type) { return GetType(Type) == CbFieldType::ObjectId; } + + static constexpr inline bool IsDateTime(CbFieldType Type) { return GetType(Type) == CbFieldType::DateTime; } + static constexpr inline bool IsTimeSpan(CbFieldType Type) { return GetType(Type) == CbFieldType::TimeSpan; } + + /** Whether the type is or may contain fields of any attachment type. */ + static constexpr inline bool MayContainAttachments(CbFieldType Type) + { + // The use of !! will suppress V792 from static analysis. Using //-V792 did not work. + return !!IsObject(Type) | !!IsArray(Type) | !!IsAttachment(Type); + } +}; + +/** Errors that can occur when accessing a field. */ +enum class CbFieldError : uint8_t +{ + /** The field is not in an error state. */ + None, + /** The value type does not match the requested type. */ + TypeError, + /** The value is out of range for the requested type. 
*/ + RangeError, +}; + +class ICbVisitor +{ +public: + virtual void SetName(std::string_view Name) = 0; + virtual void BeginObject() = 0; + virtual void EndObject() = 0; + virtual void BeginArray() = 0; + virtual void EndArray() = 0; + virtual void VisitNull() = 0; + virtual void VisitBinary(SharedBuffer Value) = 0; + virtual void VisitString(std::string_view Value) = 0; + virtual void VisitInteger(int64_t Value) = 0; + virtual void VisitInteger(uint64_t Value) = 0; + virtual void VisitFloat(float Value) = 0; + virtual void VisitDouble(double Value) = 0; + virtual void VisitBool(bool value) = 0; + virtual void VisitCbAttachment(const IoHash& Value) = 0; + virtual void VisitBinaryAttachment(const IoHash& Value) = 0; + virtual void VisitHash(const IoHash& Value) = 0; + virtual void VisitUuid(const Guid& Value) = 0; + virtual void VisitObjectId(const Oid& Value) = 0; + virtual void VisitDateTime(DateTime Value) = 0; + virtual void VisitTimeSpan(TimeSpan Value) = 0; +}; + +/** + * An atom of data in the compact binary format. + * + * Accessing the value of a field is always a safe operation, even if accessed as the wrong type. + * An invalid access will return a default value for the requested type, and set an error code on + * the field that can be checked with GetLastError and HasLastError. A valid access will clear an + * error from a previous invalid access. + * + * A field is encoded in one or more bytes, depending on its type and the type of object or array + * that contains it. A field of an object or array which is non-uniform encodes its field type in + * the first byte, and includes the HasFieldName flag for a field in an object. The field name is + * encoded in a variable-length unsigned integer of its size in bytes, for named fields, followed + * by that many bytes of the UTF-8 encoding of the name with no null terminator. The remainder of + * the field is the payload and is described in the field type enum. 
Every field must be uniquely + * addressable when encoded, which means a zero-byte field is not permitted, and only arises in a + * uniform array of fields with no payload, where the answer is to encode as a non-uniform array. + * + * This type only provides a view into memory and does not perform any memory management itself. + * Use CbFieldRef to hold a reference to the underlying memory when necessary. + */ + +class CbFieldView +{ +public: + CbFieldView() = default; + + ZENCORE_API CbFieldView(const void* DataPointer, CbFieldType FieldType = CbFieldType::HasFieldType); + + /** Returns the name of the field if it has a name, otherwise an empty view. */ + constexpr inline std::string_view GetName() const { return std::string_view(static_cast<const char*>(Payload) - NameLen, NameLen); } + + ZENCORE_API MemoryView AsBinaryView(MemoryView Default = MemoryView()); + ZENCORE_API CbObjectView AsObjectView(); + ZENCORE_API CbArrayView AsArrayView(); + ZENCORE_API std::string_view AsString(std::string_view Default = std::string_view()); + + ZENCORE_API void IterateAttachments(std::function<void(CbFieldView)> Visitor) const; + + /** Access the field as an int8. Returns the provided default on error. */ + inline int8_t AsInt8(int8_t Default = 0) { return AsInteger<int8_t>(Default); } + /** Access the field as an int16. Returns the provided default on error. */ + inline int16_t AsInt16(int16_t Default = 0) { return AsInteger<int16_t>(Default); } + /** Access the field as an int32. Returns the provided default on error. */ + inline int32_t AsInt32(int32_t Default = 0) { return AsInteger<int32_t>(Default); } + /** Access the field as an int64. Returns the provided default on error. */ + inline int64_t AsInt64(int64_t Default = 0) { return AsInteger<int64_t>(Default); } + /** Access the field as a uint8. Returns the provided default on error. */ + inline uint8_t AsUInt8(uint8_t Default = 0) { return AsInteger<uint8_t>(Default); } + /** Access the field as a uint16. 
Returns the provided default on error. */ + inline uint16_t AsUInt16(uint16_t Default = 0) { return AsInteger<uint16_t>(Default); } + /** Access the field as a uint32. Returns the provided default on error. */ + inline uint32_t AsUInt32(uint32_t Default = 0) { return AsInteger<uint32_t>(Default); } + /** Access the field as a uint64. Returns the provided default on error. */ + inline uint64_t AsUInt64(uint64_t Default = 0) { return AsInteger<uint64_t>(Default); } + + /** Access the field as a float. Returns the provided default on error. */ + ZENCORE_API float AsFloat(float Default = 0.0f); + /** Access the field as a double. Returns the provided default on error. */ + ZENCORE_API double AsDouble(double Default = 0.0); + + /** Access the field as a bool. Returns the provided default on error. */ + ZENCORE_API bool AsBool(bool bDefault = false); + + /** Access the field as a hash referencing a compact binary attachment. Returns the provided default on error. */ + ZENCORE_API IoHash AsCompactBinaryAttachment(const IoHash& Default = IoHash()); + /** Access the field as a hash referencing a binary attachment. Returns the provided default on error. */ + ZENCORE_API IoHash AsBinaryAttachment(const IoHash& Default = IoHash()); + /** Access the field as a hash referencing an attachment. Returns the provided default on error. */ + ZENCORE_API IoHash AsAttachment(const IoHash& Default = IoHash()); + + /** Access the field as a hash. Returns the provided default on error. */ + ZENCORE_API IoHash AsHash(const IoHash& Default = IoHash()); + + /** Access the field as a UUID. Returns a nil UUID on error. */ + ZENCORE_API Guid AsUuid(); + /** Access the field as a UUID. Returns the provided default on error. */ + ZENCORE_API Guid AsUuid(const Guid& Default); + + /** Access the field as an OID. Returns a nil OID on error. */ + ZENCORE_API Oid AsObjectId(); + /** Access the field as an OID. Returns the provided default on error. 
*/ + ZENCORE_API Oid AsObjectId(const Oid& Default); + + /** Access the field as a date/time tick count. Returns the provided default on error. */ + ZENCORE_API int64_t AsDateTimeTicks(int64_t Default = 0); + + /** Access the field as a date/time. Returns a date/time at the epoch on error. */ + ZENCORE_API DateTime AsDateTime(); + /** Access the field as a date/time. Returns the provided default on error. */ + ZENCORE_API DateTime AsDateTime(DateTime Default); + + /** Access the field as a timespan tick count. Returns the provided default on error. */ + ZENCORE_API int64_t AsTimeSpanTicks(int64_t Default = 0); + + /** Access the field as a timespan. Returns an empty timespan on error. */ + ZENCORE_API TimeSpan AsTimeSpan(); + /** Access the field as a timespan. Returns the provided default on error. */ + ZENCORE_API TimeSpan AsTimeSpan(TimeSpan Default); + + /** True if the field has a name. */ + constexpr inline bool HasName() const { return CbFieldTypeOps::HasFieldName(Type); } + + constexpr inline bool IsNull() const { return CbFieldTypeOps::IsNull(Type); } + + constexpr inline bool IsObject() const { return CbFieldTypeOps::IsObject(Type); } + constexpr inline bool IsArray() const { return CbFieldTypeOps::IsArray(Type); } + + constexpr inline bool IsBinary() const { return CbFieldTypeOps::IsBinary(Type); } + constexpr inline bool IsString() const { return CbFieldTypeOps::IsString(Type); } + + /** Whether the field is an integer of unspecified range and sign. */ + constexpr inline bool IsInteger() const { return CbFieldTypeOps::IsInteger(Type); } + /** Whether the field is a float, or integer that supports implicit conversion. 
*/ + constexpr inline bool IsFloat() const { return CbFieldTypeOps::IsFloat(Type); } + constexpr inline bool IsBool() const { return CbFieldTypeOps::IsBool(Type); } + + constexpr inline bool IsCompactBinaryAttachment() const { return CbFieldTypeOps::IsCompactBinaryAttachment(Type); } + constexpr inline bool IsBinaryAttachment() const { return CbFieldTypeOps::IsBinaryAttachment(Type); } + constexpr inline bool IsAttachment() const { return CbFieldTypeOps::IsAttachment(Type); } + + constexpr inline bool IsHash() const { return CbFieldTypeOps::IsHash(Type); } + constexpr inline bool IsUuid() const { return CbFieldTypeOps::IsUuid(Type); } + constexpr inline bool IsObjectId() const { return CbFieldTypeOps::IsObjectId(Type); } + + constexpr inline bool IsDateTime() const { return CbFieldTypeOps::IsDateTime(Type); } + constexpr inline bool IsTimeSpan() const { return CbFieldTypeOps::IsTimeSpan(Type); } + + /** Whether the field has a value. */ + constexpr inline explicit operator bool() const { return HasValue(); } + + /** + * Whether the field has a value. + * + * All fields in a valid object or array have a value. A field with no value is returned when + * finding a field by name fails or when accessing an iterator past the end. + */ + constexpr inline bool HasValue() const { return !CbFieldTypeOps::IsNone(Type); }; + + /** Whether the last field access encountered an error. */ + constexpr inline bool HasError() const { return Error != CbFieldError::None; } + + /** The type of error that occurred on the last field access, or None. */ + constexpr inline CbFieldError GetError() const { return Error; } + + /** Returns the size of the field in bytes, including the type and name. */ + ZENCORE_API uint64_t GetSize() const; + + /** Calculate the hash of the field, including the type and name. 
*/ + ZENCORE_API IoHash GetHash() const; + + ZENCORE_API void GetHash(IoHashStream& HashStream) const; + + /** Feed the field (including type and name) to the stream function, which is called as Hash(Data, Size): first with the serialized type, then with the view of the name and payload. */ + inline void WriteToStream(auto Hash) const + { + const CbFieldType SerializedType = CbFieldTypeOps::GetSerializedType(Type); + Hash(&SerializedType, sizeof(SerializedType)); + auto View = GetViewNoType(); + Hash(View.GetData(), View.GetSize()); + } + + /** Copy the field into a buffer of exactly GetSize() bytes, including the type and name. */ + ZENCORE_API void CopyTo(MutableMemoryView Buffer) const; + + /** Copy the field into an archive, including its type and name. */ + ZENCORE_API void CopyTo(BinaryWriter& Ar) const; + + /** + * Whether this field is identical to the other field. + * + * Performs a deep comparison of any contained arrays or objects and their fields. Comparison + * assumes that both fields are valid and are written in the canonical format. Fields must be + * written in the same order in arrays and objects, and name comparison is case sensitive. If + * these assumptions do not hold, this may return false for equivalent inputs. Validation can + * be performed with ValidateCompactBinary, except for field order and field name case. + */ + ZENCORE_API bool Equals(const CbFieldView& Other) const; + + /** Returns a view of the field, including the type and name when present. */ + ZENCORE_API MemoryView GetView() const; + + /** + * Try to get a view of the field as it would be serialized, such as by CopyTo. + * + * A serialized view is not available if the field has an externally-provided type. + * Access the serialized form of such fields using CopyTo or CbField::Clone. + */ + inline bool TryGetSerializedView(MemoryView& OutView) const + { + if (CbFieldTypeOps::HasFieldType(Type)) + { + OutView = GetView(); + return true; + } + return false; + } + +protected: + /** Returns a view of the name and value payload, which excludes the type. 
*/ + ZENCORE_API MemoryView GetViewNoType() const; + + /** Returns a view of the value payload, which excludes the type and name. */ + inline MemoryView GetPayloadView() const { return MemoryView(Payload, GetPayloadSize()); } + + /** Returns the type of the field including flags. */ + constexpr inline CbFieldType GetType() const { return Type; } + + /** Returns the start of the value payload. */ + constexpr inline const void* GetPayload() const { return Payload; } + + /** Returns the end of the value payload. */ + inline const void* GetPayloadEnd() const { return static_cast<const uint8_t*>(Payload) + GetPayloadSize(); } + + /** Returns the size of the value payload in bytes, which is the field excluding the type and name. */ + ZENCORE_API uint64_t GetPayloadSize() const; + + /** Assign a field from a pointer to its data and an optional externally-provided type. */ + inline void Assign(const void* InData, const CbFieldType InType) + { + static_assert(std::is_trivially_destructible<CbFieldView>::value, + "This optimization requires CbField to be trivially destructible!"); + new (this) CbFieldView(InData, InType); + } + +private: + /** Parameters for converting to an integer. */ + struct IntegerParams + { + /** Whether the output type has a sign bit. */ + uint32_t IsSigned : 1; + /** Bits of magnitude. (7 for int8) */ + uint32_t MagnitudeBits : 31; + }; + + /** Make integer params for the given integer type. */ + template<typename IntType> + static constexpr inline IntegerParams MakeIntegerParams() + { + IntegerParams Params; + Params.IsSigned = IntType(-1) < IntType(0); + Params.MagnitudeBits = 8 * sizeof(IntType) - Params.IsSigned; + return Params; + } + + /** + * Access the field as the given integer type. + * + * Returns the provided default if the value cannot be represented in the output type. 
+ */ + template<typename IntType> + inline IntType AsInteger(IntType Default) + { + return IntType(AsInteger(uint64_t(Default), MakeIntegerParams<IntType>())); + } + + ZENCORE_API uint64_t AsInteger(uint64_t Default, IntegerParams Params); + + /** The field type, with the transient HasFieldType flag if the field contains its type. */ + CbFieldType Type = CbFieldType::None; + /** The error (if any) that occurred on the last field access. */ + CbFieldError Error = CbFieldError::None; + /** The number of bytes for the name stored before the payload. */ + uint32_t NameLen = 0; + /** The value payload, which also points to the end of the name. */ + const void* Payload = nullptr; +}; + +template<typename FieldType> +class TCbFieldIterator : public FieldType +{ +public: + /** Construct an empty field range. */ + constexpr TCbFieldIterator() = default; + + inline TCbFieldIterator& operator++() + { + const void* const PayloadEnd = FieldType::GetPayloadEnd(); + const int64_t AtEndMask = int64_t(PayloadEnd == FieldsEnd) - 1; + const CbFieldType NextType = CbFieldType(int64_t(FieldType::GetType()) & AtEndMask); + const void* const NextField = reinterpret_cast<const void*>(int64_t(PayloadEnd) & AtEndMask); + const void* const NextFieldsEnd = reinterpret_cast<const void*>(int64_t(FieldsEnd) & AtEndMask); + + FieldType::Assign(NextField, NextType); + FieldsEnd = NextFieldsEnd; + return *this; + } + + inline TCbFieldIterator operator++(int) + { + TCbFieldIterator It(*this); + ++*this; + return It; + } + + constexpr inline FieldType& operator*() { return *this; } + constexpr inline FieldType* operator->() { return this; } + + /** Reset this to an empty field range. */ + inline void Reset() { *this = TCbFieldIterator(); } + + /** Returns the size of the fields in the range in bytes. */ + ZENCORE_API uint64_t GetRangeSize() const; + + /** Calculate the hash of every field in the range. 
*/ + ZENCORE_API IoHash GetRangeHash() const; + ZENCORE_API void GetRangeHash(IoHashStream& Hash) const; + + using FieldType::Equals; + + template<typename OtherFieldType> + constexpr inline bool Equals(const TCbFieldIterator<OtherFieldType>& Other) const + { + return FieldType::GetPayload() == Other.OtherFieldType::GetPayload() && FieldsEnd == Other.FieldsEnd; + } + + template<typename OtherFieldType> + constexpr inline bool operator==(const TCbFieldIterator<OtherFieldType>& Other) const + { + return Equals(Other); + } + + template<typename OtherFieldType> + constexpr inline bool operator!=(const TCbFieldIterator<OtherFieldType>& Other) const + { + return !Equals(Other); + } + + /** Copy the field range into a buffer of exactly GetRangeSize() bytes. */ + ZENCORE_API void CopyRangeTo(MutableMemoryView Buffer) const; + + /** Invoke the visitor for every attachment in the field range. */ + ZENCORE_API void IterateRangeAttachments(std::function<void(CbFieldView)> Visitor) const; + + /** Create a view of every field in the range. */ + inline MemoryView GetRangeView() const { return MemoryView(FieldType::GetView().GetData(), FieldsEnd); } + + /** + * Try to get a view of every field in the range as they would be serialized. + * + * A serialized view is not available if the underlying fields have an externally-provided type. + * Access the serialized form of such ranges using CbFieldIterator::CloneRange. + */ + inline bool TryGetSerializedRangeView(MemoryView& OutView) const + { + if (CbFieldTypeOps::HasFieldType(FieldType::GetType())) + { + OutView = GetRangeView(); + return true; + } + return false; + } + +protected: + /** Construct a field range that contains exactly one field. */ + constexpr inline explicit TCbFieldIterator(FieldType InField) : FieldType(std::move(InField)), FieldsEnd(FieldType::GetPayloadEnd()) {} + + /** + * Construct a field range from the first field and a pointer to the end of the last field. 
+ * + * @param InField The first field, or the default field if there are no fields. + * @param InFieldsEnd A pointer to the end of the payload of the last field, or null. + */ + constexpr inline TCbFieldIterator(FieldType&& InField, const void* InFieldsEnd) : FieldType(std::move(InField)), FieldsEnd(InFieldsEnd) + { + } + + /** Returns the end of the last field, or null for an iterator at the end. */ + template<typename OtherFieldType> + static inline const void* GetFieldsEnd(const TCbFieldIterator<OtherFieldType>& It) + { + return It.FieldsEnd; + } + +private: + friend inline TCbFieldIterator begin(const TCbFieldIterator& Iterator) { return Iterator; } + friend inline TCbFieldIterator end(const TCbFieldIterator&) { return TCbFieldIterator(); } + +private: + template<typename OtherType> + friend class TCbFieldIterator; + + friend class CbFieldViewIterator; + + /** Pointer to the first byte past the end of the last field. Set to null at the end. */ + const void* FieldsEnd = nullptr; +}; + +/** + * Iterator for CbFieldView. + * + * @see CbFieldIterator + */ +class CbFieldViewIterator : public TCbFieldIterator<CbFieldView> +{ +public: + constexpr CbFieldViewIterator() = default; + + /** Construct a field range that contains exactly one field. */ + static inline CbFieldViewIterator MakeSingle(const CbFieldView& Field) { return CbFieldViewIterator(Field); } + + /** + * Construct a field range from a buffer containing zero or more valid fields. + * + * @param View A buffer containing zero or more valid fields. An empty view yields an empty field range. + * @param Type HasFieldType means that View contains the type. Otherwise, use the given type. + */ + static inline CbFieldViewIterator MakeRange(MemoryView View, CbFieldType Type = CbFieldType::HasFieldType) + { + return !View.IsEmpty() ? TCbFieldIterator(CbFieldView(View.GetData(), Type), View.GetDataEnd()) : CbFieldViewIterator(); + } + + /** Construct an iterator from another iterator. 
*/ + template<typename OtherFieldType> + inline CbFieldViewIterator(const TCbFieldIterator<OtherFieldType>& It) + : TCbFieldIterator(ImplicitConv<CbFieldView>(It), GetFieldsEnd(It)) + { + } + +private: + using TCbFieldIterator::TCbFieldIterator; +}; + +/** + * Array of CbFieldView that have no names. + * + * Accessing a field of the array requires iteration. Access by index is not provided because the + * cost of accessing an item by index scales linearly with the index. + * + * This type only provides a view into memory and does not perform any memory management itself. + * Use CbArray to hold a reference to the underlying memory when necessary. + */ +class CbArrayView : protected CbFieldView +{ + friend class CbFieldView; + +public: + /** @see CbFieldView::CbFieldView */ + using CbFieldView::CbFieldView; + + /** Construct an array with no fields. */ + ZENCORE_API CbArrayView(); + + /** Returns the number of items in the array. */ + ZENCORE_API uint64_t Num() const; + + /** Create an iterator for the fields of this array. */ + ZENCORE_API CbFieldViewIterator CreateViewIterator() const; + + /** Visit the fields of this array. */ + ZENCORE_API void VisitFields(ICbVisitor& Visitor); + + /** Access the array as an array field. */ + inline CbFieldView AsFieldView() const { return static_cast<const CbFieldView&>(*this); } + + /** Construct an array from an array field. No type check is performed! */ + static inline CbArrayView FromFieldView(const CbFieldView& Field) { return CbArrayView(Field); } + + /** Returns the size of the array in bytes if serialized by itself with no name. */ + ZENCORE_API uint64_t GetSize() const; + + /** Calculate the hash of the array if serialized by itself with no name. */ + ZENCORE_API IoHash GetHash() const; + + ZENCORE_API void GetHash(IoHashStream& Stream) const; + + /** + * Whether this array is identical to the other array. + * + * Performs a deep comparison of any contained arrays or objects and their fields. 
Comparison + * assumes that both fields are valid and are written in the canonical format. Fields must be + * written in the same order in arrays and objects, and name comparison is case sensitive. If + * these assumptions do not hold, this may return false for equivalent inputs. Validation can + * be done with the All mode to check these assumptions about the format of the inputs. + */ + ZENCORE_API bool Equals(const CbArrayView& Other) const; + + /** Copy the array into a buffer of exactly GetSize() bytes, with no name. */ + ZENCORE_API void CopyTo(MutableMemoryView Buffer) const; + + /** Copy the array into an archive, including its type and name. */ + ZENCORE_API void CopyTo(BinaryWriter& Ar) const; + + ///** Invoke the visitor for every attachment in the array. */ + inline void IterateAttachments(std::function<void(CbFieldView)> Visitor) const + { + CreateViewIterator().IterateRangeAttachments(Visitor); + } + + /** Returns a view of the array, including the type and name when present. */ + using CbFieldView::GetView; + +private: + friend inline CbFieldViewIterator begin(const CbArrayView& Array) { return Array.CreateViewIterator(); } + friend inline CbFieldViewIterator end(const CbArrayView&) { return CbFieldViewIterator(); } + + /** Construct an array from an array field. No type check is performed! Use via FromField. */ + inline explicit CbArrayView(const CbFieldView& Field) : CbFieldView(Field) {} +}; + +class CbObjectView : protected CbFieldView +{ + friend class CbFieldView; + +public: + /** @see CbField::CbField */ + using CbFieldView::CbFieldView; + + /** Construct an object with no fields. */ + ZENCORE_API CbObjectView(); + + /** Create an iterator for the fields of this object. */ + ZENCORE_API CbFieldViewIterator CreateViewIterator() const; + + /** Visit the fields of this object. */ + ZENCORE_API void VisitFields(ICbVisitor& Visitor); + + /** + * Find a field by case-sensitive name comparison. 
+ * + * The cost of this operation scales linearly with the number of fields in the object. Prefer + * to iterate over the fields only once when consuming an object. + * + * @param Name The name of the field. + * @return The matching field if found, otherwise a field with no value. + */ + ZENCORE_API CbFieldView FindView(std::string_view Name) const; + + /** Find a field by case-insensitive name comparison. */ + ZENCORE_API CbFieldView FindViewIgnoreCase(std::string_view Name) const; + + /** Find a field by case-sensitive name comparison. */ + inline CbFieldView operator[](std::string_view Name) const { return FindView(Name); } + + /** Access the object as an object field. */ + inline CbFieldView AsFieldView() const { return static_cast<const CbFieldView&>(*this); } + + /** Construct an object from an object field. No type check is performed! */ + static inline CbObjectView FromFieldView(const CbFieldView& Field) { return CbObjectView(Field); } + + /** Returns the size of the object in bytes if serialized by itself with no name. */ + ZENCORE_API uint64_t GetSize() const; + + /** Calculate the hash of the object if serialized by itself with no name. */ + ZENCORE_API IoHash GetHash() const; + + ZENCORE_API void GetHash(IoHashStream& HashStream) const; + + /** + * Whether this object is identical to the other object. + * + * Performs a deep comparison of any contained arrays or objects and their fields. Comparison + * assumes that both fields are valid and are written in the canonical format. Fields must be + * written in the same order in arrays and objects, and name comparison is case sensitive. If + * these assumptions do not hold, this may return false for equivalent inputs. Validation can + * be done with the All mode to check these assumptions about the format of the inputs. + */ + ZENCORE_API bool Equals(const CbObjectView& Other) const; + + /** Copy the object into a buffer of exactly GetSize() bytes, with no name. 
*/ + ZENCORE_API void CopyTo(MutableMemoryView Buffer) const; + + /** Copy the field into an archive, including its type and name. */ + ZENCORE_API void CopyTo(BinaryWriter& Ar) const; + + ///** Invoke the visitor for every attachment in the object. */ + inline void IterateAttachments(std::function<void(CbFieldView)> Visitor) const + { + CreateViewIterator().IterateRangeAttachments(Visitor); + } + + /** Returns a view of the object, including the type and name when present. */ + using CbFieldView::GetView; + + /** Whether the field has a value. */ + using CbFieldView::operator bool; + +private: + friend inline CbFieldViewIterator begin(const CbObjectView& Object) { return Object.CreateViewIterator(); } + friend inline CbFieldViewIterator end(const CbObjectView&) { return CbFieldViewIterator(); } + + /** Construct an object from an object field. No type check is performed! Use via FromField. */ + inline explicit CbObjectView(const CbFieldView& Field) : CbFieldView(Field) {} +}; + +////////////////////////////////////////////////////////////////////////// + +/** A reference to a function that is used to allocate buffers for compact binary data. */ +using BufferAllocator = std::function<UniqueBuffer(uint64_t Size)>; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** A wrapper that holds a reference to the buffer that contains its compact binary value. */ +template<typename BaseType> +class CbBuffer : public BaseType +{ +public: + /** Construct a default value. */ + CbBuffer() = default; + + /** + * Construct a value from a pointer to its data and an optional externally-provided type. + * + * @param ValueBuffer A buffer that exactly contains the value. + * @param Type HasFieldType means that ValueBuffer contains the type. Otherwise, use the given type. 
+ */ + inline explicit CbBuffer(SharedBuffer ValueBuffer, CbFieldType Type = CbFieldType::HasFieldType) + { + if (ValueBuffer) + { + BaseType::operator=(BaseType(ValueBuffer.GetData(), Type)); + ZEN_ASSERT(ValueBuffer.GetView().Contains(BaseType::GetView())); + Buffer = std::move(ValueBuffer); + } + } + + /** Construct a value that holds a reference to the buffer that contains it. */ + inline CbBuffer(const BaseType& Value, SharedBuffer OuterBuffer) : BaseType(Value) + { + if (OuterBuffer) + { + ZEN_ASSERT(OuterBuffer.GetView().Contains(BaseType::GetView())); + Buffer = std::move(OuterBuffer); + } + } + + /** Construct a value that holds a reference to the buffer of the outer that contains it. */ + template<typename OtherBaseType> + inline CbBuffer(const BaseType& Value, CbBuffer<OtherBaseType> OuterRef) : CbBuffer(Value, std::move(OuterRef.Buffer)) + { + } + + /** Reset this to a default value and null buffer. */ + inline void Reset() { *this = CbBuffer(); } + + /** Whether this reference has ownership of the memory in its buffer. */ + inline bool IsOwned() const { return Buffer && Buffer.IsOwned(); } + + /** Clone the value, if necessary, to a buffer that this reference has ownership of. */ + inline void MakeOwned() + { + if (!IsOwned()) + { + UniqueBuffer MutableBuffer = UniqueBuffer::Alloc(BaseType::GetSize()); + BaseType::CopyTo(MutableBuffer); + BaseType::operator=(BaseType(MutableBuffer.GetData())); + Buffer = std::move(MutableBuffer); + } + } + + /** Returns a buffer that exactly contains this value. */ + inline SharedBuffer GetBuffer() const + { + const MemoryView View = BaseType::GetView(); + const SharedBuffer& OuterBuffer = GetOuterBuffer(); + return View == OuterBuffer.GetView() ? OuterBuffer : SharedBuffer::MakeView(View, OuterBuffer); + } + + /** Returns the outer buffer (if any) that contains this value. 
*/ + inline const SharedBuffer& GetOuterBuffer() const& { return Buffer; } + inline SharedBuffer GetOuterBuffer() && { return std::move(Buffer); } + +private: + template<typename OtherType> + friend class CbBuffer; + + SharedBuffer Buffer; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Factory functions for types derived from CbBuffer. + * + * This uses the curiously recurring template pattern to construct the correct type of reference. + * The derived type inherits from CbBufferRef and this type to expose the factory functions. + */ +template<typename RefType, typename BaseType> +class CbBufferFactory +{ +public: + /** Construct a value from an owned clone of its memory. */ + static inline RefType Clone(const void* const Data) { return Clone(BaseType(Data)); } + + /** Construct a value from an owned clone of its memory. */ + static inline RefType Clone(const BaseType& Value) + { + RefType Ref = MakeView(Value); + Ref.MakeOwned(); + return Ref; + } + + /** Construct a value from a read-only view of its memory and its optional outer buffer. */ + static inline RefType MakeView(const void* const Data, SharedBuffer OuterBuffer = SharedBuffer()) + { + return MakeView(BaseType(Data), std::move(OuterBuffer)); + } + + /** Construct a value from a read-only view of its memory and its optional outer buffer. */ + static inline RefType MakeView(const BaseType& Value, SharedBuffer OuterBuffer = SharedBuffer()) + { + return RefType(Value, std::move(OuterBuffer)); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +class CbArray; +class CbObject; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * A field that can hold a reference to the memory that contains it. 
+ * + * @see CbBuffer + */ +class CbField : public CbBuffer<CbFieldView>, public CbBufferFactory<CbField, CbFieldView> +{ +public: + using CbBuffer::CbBuffer; + + /** Access the field as an object. Defaults to an empty object on error. The result shares this field's buffer reference. */ + inline CbObject AsObject() &; + + /** Access the field as an object. Defaults to an empty object on error. Rvalue overload: this field is moved into the result. */ + inline CbObject AsObject() &&; + + /** Access the field as an array. Defaults to an empty array on error. The result shares this field's buffer reference. */ + inline CbArray AsArray() &; + + /** Access the field as an array. Defaults to an empty array on error. Rvalue overload: this field is moved into the result. */ + inline CbArray AsArray() &&; +}; + +/** + * Iterator for CbField. + * + * @see TCbFieldIterator + */ +class CbFieldIterator : public TCbFieldIterator<CbField> +{ +public: + /** Construct a field range from an owned clone of a range. */ + ZENCORE_API static CbFieldIterator CloneRange(const CbFieldViewIterator& It); + + /** Construct a field range from an owned clone of a range. */ + static inline CbFieldIterator CloneRange(const CbFieldIterator& It) { return CloneRange(CbFieldViewIterator(It)); } + + /** Construct a field range that contains exactly one field. */ + static inline CbFieldIterator MakeSingle(CbField Field) { return CbFieldIterator(std::move(Field)); } + + /** + * Construct a field range from a buffer containing zero or more valid fields. + * + * An empty buffer yields an empty range. + * + * @param Buffer A buffer containing zero or more valid fields. + * @param Type HasFieldType means that Buffer contains the type. Otherwise, use the given type. + */ + static inline CbFieldIterator MakeRange(SharedBuffer Buffer, CbFieldType Type = CbFieldType::HasFieldType) + { + if (Buffer.GetSize()) + { + /* Capture the end pointer before Buffer is moved into the field below. */ const void* const DataEnd = Buffer.GetView().GetDataEnd(); + return CbFieldIterator(CbField(std::move(Buffer), Type), DataEnd); + } + return CbFieldIterator(); + } + + /** Construct a field range from an iterator and its optional outer buffer. 
*/ + static inline CbFieldIterator MakeRangeView(const CbFieldViewIterator& It, SharedBuffer OuterBuffer = SharedBuffer()) + { + return CbFieldIterator(CbField(It, std::move(OuterBuffer)), GetFieldsEnd(It)); + } + + /** Construct an empty field range. */ + constexpr CbFieldIterator() = default; + + /** Clone the range, if necessary, to a buffer that this reference has ownership of. Does nothing when the range is already owned. */ + inline void MakeRangeOwned() + { + if (!IsOwned()) + { + *this = CloneRange(*this); + } + } + + /** Returns a buffer that exactly contains the field range. */ + ZENCORE_API SharedBuffer GetRangeBuffer() const; + +private: + /** Base-class constructors are inherited privately; the static factory functions above use them to build ranges. */ using TCbFieldIterator::TCbFieldIterator; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * An array that can hold a reference to the memory that contains it. + * + * @see CbBuffer + */ +class CbArray : public CbBuffer<CbArrayView>, public CbBufferFactory<CbArray, CbArrayView> +{ +public: + using CbBuffer::CbBuffer; + + /** Create an iterator for the fields of this array. The iterator shares this array's outer buffer. */ + inline CbFieldIterator CreateIterator() const { return CbFieldIterator::MakeRangeView(CreateViewIterator(), GetOuterBuffer()); } + + /** Access the array as an array field. The result shares this array's buffer reference. */ + inline CbField AsField() const& { return CbField(CbArrayView::AsFieldView(), *this); } + + /** Access the array as an array field. Rvalue overload: this array is moved into the result. */ + inline CbField AsField() && { return CbField(CbArrayView::AsFieldView(), std::move(*this)); } + +private: + /** Range-based for support: begin() iterates the array's fields and end() is an empty (default-constructed) iterator. */ friend inline CbFieldIterator begin(const CbArray& Array) { return Array.CreateIterator(); } + friend inline CbFieldIterator end(const CbArray&) { return CbFieldIterator(); } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * An object that can hold a reference to the memory that contains it. 
+ * + * @see CbBuffer + */ +class CbObject : public CbBuffer<CbObjectView>, public CbBufferFactory<CbObject, CbObjectView> +{ +public: + using CbBuffer::CbBuffer; + + /** Create an iterator for the fields of this object. */ + inline CbFieldIterator CreateIterator() const { return CbFieldIterator::MakeRangeView(CreateViewIterator(), GetOuterBuffer()); } + + /** Find a field by case-sensitive name comparison. */ + inline CbField Find(std::string_view Name) const + { + if (CbFieldView Field = FindView(Name)) + { + return CbField(Field, *this); + } + return CbField(); + } + + /** Find a field by case-insensitive name comparison. */ + inline CbField FindIgnoreCase(std::string_view Name) const + { + if (CbFieldView Field = FindIgnoreCase(Name)) + { + return CbField(Field, *this); + } + return CbField(); + } + + /** Find a field by case-sensitive name comparison. */ + inline CbFieldView operator[](std::string_view Name) const { return Find(Name); } + + /** Access the object as an object field. */ + inline CbField AsField() const& { return CbField(CbObjectView::AsFieldView(), *this); } + + /** Access the object as an object field. */ + inline CbField AsField() && { return CbField(CbObjectView::AsFieldView(), std::move(*this)); } + +private: + friend inline CbFieldIterator begin(const CbObject& Object) { return Object.CreateIterator(); } + friend inline CbFieldIterator end(const CbObject&) { return CbFieldIterator(); } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +inline CbObject +CbField::AsObject() & +{ + return IsObject() ? CbObject(AsObjectView(), *this) : CbObject(); +} + +inline CbObject +CbField::AsObject() && +{ + return IsObject() ? CbObject(AsObjectView(), std::move(*this)) : CbObject(); +} + +inline CbArray +CbField::AsArray() & +{ + return IsArray() ? CbArray(AsArrayView(), *this) : CbArray(); +} + +inline CbArray +CbField::AsArray() && +{ + return IsArray() ? 
CbArray(AsArrayView(), std::move(*this)) : CbArray(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +ZENCORE_API CbField LoadCompactBinary(BinaryReader& Ar, BufferAllocator Allocator); + +inline CbObject +LoadCompactBinaryObject(IoBuffer Payload) +{ + return CbObject{SharedBuffer::MakeView(Payload.Data(), Payload.Size())}; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Determine the size in bytes of the compact binary field at the start of the view. + * + * This may be called on an incomplete or invalid field, in which case the returned size is zero. + * A size can always be extracted from a valid field with no name if a view of at least the first + * 10 bytes is provided, regardless of field size. For fields with names, the size of view needed + * to calculate a size is at most 10 + MaxNameLen + MeasureVarUInt(MaxNameLen). + * + * This function can be used when streaming a field, for example, to determine the size of buffer + * to fill before attempting to construct a field from it. + * + * @param View A memory view that may contain the start of a field. + * @param Type HasFieldType means that View contains the type. Otherwise, use the given type. + */ +ZENCORE_API uint64_t MeasureCompactBinary(MemoryView View, CbFieldType Type = CbFieldType::HasFieldType); + +/** + * Try to determine the type and size of the compact binary field at the start of the view. + * + * This may be called on an incomplete or invalid field, in which case it will return false, with + * OutSize being 0 for invalid fields, otherwise the minimum view size necessary to make progress + * in measuring the field on the next call to this function. + * + * @note A return of true from this function does not indicate that the entire field is valid. + * + * @param InView A memory view that may contain the start of a field. 
+ * @param OutType The type (with flags) of the field. None is written until a value is available. + * @param OutSize The total field size for a return of true, 0 for invalid fields, or the size to + * make progress in measuring the field on the next call to this function. + * @param InType HasFieldType means that InView contains the type. Otherwise, use the given type. + * @return true if the size of the field was determined, otherwise false. + */ +ZENCORE_API bool TryMeasureCompactBinary(MemoryView InView, + CbFieldType& OutType, + uint64_t& OutSize, + CbFieldType InType = CbFieldType::HasFieldType); + +/** Begin iterating the fields contained in an array or object field. Any other field type yields an empty (default-constructed) iterator. */ inline CbFieldViewIterator +begin(CbFieldView& View) +{ + if (View.IsArray()) + { + return View.AsArrayView().CreateViewIterator(); + } + else if (View.IsObject()) + { + return View.AsObjectView().CreateViewIterator(); + } + + return CbFieldViewIterator(); +} + +/** End sentinel matching begin(CbFieldView&): always a default-constructed iterator. */ inline CbFieldViewIterator +end(CbFieldView&) +{ + return CbFieldViewIterator(); +} + +void uson_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/compactbinarybuilder.h b/zencore/include/zencore/compactbinarybuilder.h new file mode 100644 index 000000000..83d4309f7 --- /dev/null +++ b/zencore/include/zencore/compactbinarybuilder.h @@ -0,0 +1,633 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/zencore.h> + +#include <zencore/compactbinary.h> + +#include <zencore/enumflags.h> +#include <zencore/iobuffer.h> +#include <zencore/iohash.h> +#include <zencore/refcount.h> +#include <zencore/sha1.h> + +#include <atomic> +#include <memory> +#include <string> +#include <string_view> +#include <type_traits> +#include <vector> + +#include <gsl/gsl-lite.hpp> + +namespace zen { + +class CbAttachment; +class BinaryWriter; + +/** + * A writer for compact binary objects, arrays, and fields. + * + * The writer produces a sequence of fields that can be saved to a provided memory buffer or into + * a new owned buffer. 
The typical use case is to write a single object, which can be accessed by + * calling Save().AsObject() or Save(Buffer).AsObjectView(). + * + * The writer will assert on most incorrect usage and will always produce valid compact binary if + * provided with valid input. The writer does not check for invalid UTF-8 string encoding, object + * fields with duplicate names, or invalid compact binary being copied from another source. + * + * It is most convenient to use the streaming API for the writer, as demonstrated in the example. + * + * When writing a small amount of compact binary data, FixedCbWriter is intended to be more efficient + * as it reserves a fixed-size inline buffer (see the note on FixedCbWriter: the buffer is not yet used). + * + * @see FixedCbWriter + * + * Example: + * + * CbObject WriteObject() + * { + * FixedCbWriter<256> Writer; + * Writer.BeginObject(); + * + * Writer << "Resize" << true; + * Writer << "MaxWidth" << 1024; + * Writer << "MaxHeight" << 1024; + * + * Writer.BeginArray(); + * Writer << "FormatA" << "FormatB" << "FormatC"; + * Writer.EndArray(); + * + * Writer.EndObject(); + * return Writer.Save().AsObject(); + * } + */ +class CbWriter +{ +public: + ZENCORE_API CbWriter(); + ZENCORE_API ~CbWriter(); + + CbWriter(const CbWriter&) = delete; + CbWriter& operator=(const CbWriter&) = delete; + + /** Empty the writer without releasing any allocated memory. */ + ZENCORE_API void Reset(); + + /** + * Serialize the field(s) to an owned buffer and return it as an iterator. + * + * It is not valid to call this function in the middle of writing an object, array, or field. + * The writer remains valid for further use when this function returns. + */ + ZENCORE_API CbFieldIterator Save(); + + /** + * Serialize the field(s) to memory. + * + * It is not valid to call this function in the middle of writing an object, array, or field. + * The writer remains valid for further use when this function returns. + * + * @param Buffer A mutable memory view to write to. 
Must be exactly GetSaveSize() bytes. + * @return An iterator for the field(s) written to the buffer. + */ + ZENCORE_API CbFieldViewIterator Save(MutableMemoryView Buffer); + + /** NOTE(review): presumably serializes the field(s) into the given BinaryWriter; the implementation is not visible here - confirm. */ ZENCORE_API void Save(BinaryWriter& Writer); + + /** + * The size of buffer (in bytes) required to serialize the fields that have been written. + * + * It is not valid to call this function in the middle of writing an object, array, or field. + */ + ZENCORE_API uint64_t GetSaveSize() const; + + /** + * Sets the name of the next field to be written. + * + * It is not valid to call this function when writing a field inside an array. + * Names must be valid UTF-8 and must be unique within an object. + */ + ZENCORE_API CbWriter& SetName(std::string_view Name); + + /** Copy the value (not the name) of an existing field, writing it under the given name. */ + inline void AddField(std::string_view Name, const CbFieldView& Value) + { + SetName(Name); + AddField(Value); + } + + ZENCORE_API void AddField(const CbFieldView& Value); + + /** Copy the value (not the name) of an existing field, writing it under the given name. Holds a reference if owned. */ + inline void AddField(std::string_view Name, const CbField& Value) + { + SetName(Name); + AddField(Value); + } + ZENCORE_API void AddField(const CbField& Value); + + /** Begin a new object. Must have a matching call to EndObject. The named overload sets the field name first. */ + inline void BeginObject(std::string_view Name) + { + SetName(Name); + BeginObject(); + } + ZENCORE_API void BeginObject(); + /** End an object after its fields have been written. */ + ZENCORE_API void EndObject(); + + /** Copy the value (not the name) of an existing object, writing it under the given name. */ + inline void AddObject(std::string_view Name, const CbObjectView& Value) + { + SetName(Name); + AddObject(Value); + } + ZENCORE_API void AddObject(const CbObjectView& Value); + /** Copy the value (not the name) of an existing object. Holds a reference if owned. 
*/ + inline void AddObject(std::string_view Name, const CbObject& Value) + { + SetName(Name); + AddObject(Value); + } + ZENCORE_API void AddObject(const CbObject& Value); + + /** Begin a new array. Must have a matching call to EndArray. The named overload sets the field name first. */ + inline void BeginArray(std::string_view Name) + { + SetName(Name); + BeginArray(); + } + ZENCORE_API void BeginArray(); + /** End an array after its fields have been written. */ + ZENCORE_API void EndArray(); + + /** Copy the value (not the name) of an existing array, writing it under the given name. */ + inline void AddArray(std::string_view Name, const CbArrayView& Value) + { + SetName(Name); + AddArray(Value); + } + ZENCORE_API void AddArray(const CbArrayView& Value); + /** Copy the value (not the name) of an existing array, writing it under the given name. Holds a reference if owned. */ + inline void AddArray(std::string_view Name, const CbArray& Value) + { + SetName(Name); + AddArray(Value); + } + ZENCORE_API void AddArray(const CbArray& Value); + + /** Write a null field. */ + inline void AddNull(std::string_view Name) + { + SetName(Name); + AddNull(); + } + ZENCORE_API void AddNull(); + + /** Write a binary field by copying Size bytes from Value. */ + inline void AddBinary(std::string_view Name, const void* Value, uint64_t Size) + { + SetName(Name); + AddBinary(Value, Size); + } + ZENCORE_API void AddBinary(const void* Value, uint64_t Size); + /** Write a binary field by copying the view. */ + inline void AddBinary(std::string_view Name, MemoryView Value) + { + SetName(Name); + AddBinary(Value); + } + /** Unnamed view overload: forwards the view's data pointer and size to the pointer/size overload. */ inline void AddBinary(MemoryView Value) { AddBinary(Value.GetData(), Value.GetSize()); } + + /** Write a binary field by copying the buffer. Holds a reference if owned. */ + inline void AddBinary(std::string_view Name, IoBuffer Value) + { + SetName(Name); + AddBinary(std::move(Value)); + } + ZENCORE_API void AddBinary(IoBuffer Value); + /** SharedBuffer overload. Note that this class declares no named (Name, SharedBuffer) convenience overload; call SetName() first when a name is needed. */ ZENCORE_API void AddBinary(SharedBuffer Value); + + /** Write a string field by copying the UTF-8 value. 
*/ + inline void AddString(std::string_view Name, std::string_view Value) + { + SetName(Name); + AddString(Value); + } + ZENCORE_API void AddString(std::string_view Value); + /** Write a string field by converting the UTF-16 value to UTF-8. */ + inline void AddString(std::string_view Name, std::wstring_view Value) + { + SetName(Name); + AddString(Value); + } + ZENCORE_API void AddString(std::wstring_view Value); + + /** Write an integer field from a signed 32-bit value. */ + inline void AddInteger(std::string_view Name, int32_t Value) + { + SetName(Name); + AddInteger(Value); + } + ZENCORE_API void AddInteger(int32_t Value); + /** Write an integer field from a signed 64-bit value. */ + inline void AddInteger(std::string_view Name, int64_t Value) + { + SetName(Name); + AddInteger(Value); + } + ZENCORE_API void AddInteger(int64_t Value); + /** Write an integer field from an unsigned 32-bit value. */ + inline void AddInteger(std::string_view Name, uint32_t Value) + { + SetName(Name); + AddInteger(Value); + } + ZENCORE_API void AddInteger(uint32_t Value); + /** Write an integer field from an unsigned 64-bit value. */ + inline void AddInteger(std::string_view Name, uint64_t Value) + { + SetName(Name); + AddInteger(Value); + } + ZENCORE_API void AddInteger(uint64_t Value); + + /** Write a float field from a 32-bit float value. */ + inline void AddFloat(std::string_view Name, float Value) + { + SetName(Name); + AddFloat(Value); + } + ZENCORE_API void AddFloat(float Value); + + /** Write a float field from a 64-bit float value. */ + inline void AddFloat(std::string_view Name, double Value) + { + SetName(Name); + AddFloat(Value); + } + ZENCORE_API void AddFloat(double Value); + + /** Write a bool field. */ + inline void AddBool(std::string_view Name, bool bValue) + { + SetName(Name); + AddBool(bValue); + } + ZENCORE_API void AddBool(bool bValue); + + /** Write a field referencing a compact binary attachment by its hash. 
*/ + inline void AddCompactBinaryAttachment(std::string_view Name, const IoHash& Value) + { + SetName(Name); + AddCompactBinaryAttachment(Value); + } + ZENCORE_API void AddCompactBinaryAttachment(const IoHash& Value); + + /** Write a field referencing a binary attachment by its hash. */ + inline void AddBinaryAttachment(std::string_view Name, const IoHash& Value) + { + SetName(Name); + AddBinaryAttachment(Value); + } + ZENCORE_API void AddBinaryAttachment(const IoHash& Value); + + /** Write a field referencing the attachment by its hash. NOTE(review): presumably emits a compact-binary or binary attachment field depending on the attachment's kind - confirm in the implementation. */ + inline void AddAttachment(std::string_view Name, const CbAttachment& Attachment) + { + SetName(Name); + AddAttachment(Attachment); + } + ZENCORE_API void AddAttachment(const CbAttachment& Attachment); + + /** Write a hash field. */ + inline void AddHash(std::string_view Name, const IoHash& Value) + { + SetName(Name); + AddHash(Value); + } + ZENCORE_API void AddHash(const IoHash& Value); + + /** Write a UUID field. */ + inline void AddUuid(std::string_view Name, const Guid& Value) + { + SetName(Name); + AddUuid(Value); + } + ZENCORE_API void AddUuid(const Guid& Value); + + /** Write an ObjectId field. */ + inline void AddObjectId(std::string_view Name, const Oid& Value) + { + SetName(Name); + AddObjectId(Value); + } + ZENCORE_API void AddObjectId(const Oid& Value); + + /** Write a date/time field with the specified count of 100ns ticks since the epoch. */ + inline void AddDateTimeTicks(std::string_view Name, int64_t Ticks) + { + SetName(Name); + AddDateTimeTicks(Ticks); + } + ZENCORE_API void AddDateTimeTicks(int64_t Ticks); + + /** Write a date/time field from a DateTime value. */ + inline void AddDateTime(std::string_view Name, DateTime Value) + { + SetName(Name); + AddDateTime(Value); + } + ZENCORE_API void AddDateTime(DateTime Value); + + /** Write a time span field with the specified count of 100ns ticks. 
*/ + inline void AddTimeSpanTicks(std::string_view Name, int64_t Ticks) + { + SetName(Name); + AddTimeSpanTicks(Ticks); + } + ZENCORE_API void AddTimeSpanTicks(int64_t Ticks); + + /** Write a time span field from a TimeSpan value. */ + inline void AddTimeSpan(std::string_view Name, TimeSpan Value) + { + SetName(Name); + AddTimeSpan(Value); + } + ZENCORE_API void AddTimeSpan(TimeSpan Value); + + /** Private flags that are public to work with ENUM_CLASS_FLAGS. */ + enum class StateFlags : uint8_t; + +protected: + /** Reserve the specified size up front until the format is optimized. */ + ZENCORE_API explicit CbWriter(int64_t InitialSize); + +private: + /** The string streaming operator is a friend because it calls the private SetNameOrAddString. */ friend CbWriter& operator<<(CbWriter& Writer, std::string_view NameOrValue); + + /** Begin writing a field. May be called twice for named fields. */ + void BeginField(); + + /** Finish writing a field by writing its type. */ + void EndField(CbFieldType Type); + + /** Set the field name if valid in this state, otherwise add a string field. */ + ZENCORE_API void SetNameOrAddString(std::string_view NameOrValue); + + /** Returns a view of the name of the active field, if any, otherwise the empty view. */ + std::string_view GetActiveName() const; + + /** Remove field types after the first to make the sequence uniform. */ + void MakeFieldsUniform(int64_t FieldBeginOffset, int64_t FieldEndOffset); + + /** State of the object, array, or top-level field being written. */ + struct WriterState + { + StateFlags Flags{}; + /** The type of the fields in the sequence if uniform, otherwise None. */ + CbFieldType UniformType{}; + /** The offset of the start of the current field. */ + int64_t Offset{}; + /** The number of fields written in this state. */ + uint64_t Count{}; + }; + +private: + // This is a prototype-quality format for the writer. 
Using an array of bytes is inefficient, + // and will lead to many unnecessary copies and moves of the data to resize the array, insert + // object and array sizes, and remove field types for uniform objects and uniform arrays. The + // optimized format will be a list of power-of-two blocks and an optional first block that is + // provided externally, such as on the stack. That format will store the offsets that require + // object or array sizes to be inserted and field types to be removed, and will perform those + // operations only when saving to a buffer. + std::vector<uint8_t> Data; + std::vector<WriterState> States; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * A writer for compact binary objects, arrays, and fields that reserves a fixed-size inline buffer. + * + * In this revision the inline buffer is only reserved, not used for storage (see the comment on the + * Buffer member); InlineBufferSize is passed to the CbWriter constructor as its initial size. + * + * @see CbWriter + */ +template<uint32_t InlineBufferSize> +class FixedCbWriter : public CbWriter +{ +public: + inline FixedCbWriter() : CbWriter(InlineBufferSize) {} + + FixedCbWriter(const FixedCbWriter&) = delete; + FixedCbWriter& operator=(const FixedCbWriter&) = delete; + +private: + // Reserve the inline buffer now even though we are unable to use it. This will avoid causing + // new stack overflows when this functionality is properly implemented in the future. 
+ uint8_t Buffer[InlineBufferSize]; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** A writer for exactly one top-level object: the constructor begins the object, and Save() ends it (once, via Finalize) before serializing. */ class CbObjectWriter : public CbWriter +{ +public: + CbObjectWriter() { BeginObject(); } + + /** End the object if that has not happened yet, then serialize to an owned buffer and return the result as an object. */ ZENCORE_API CbObject Save() + { + Finalize(); + return CbWriter::Save().AsObject(); + } + + /** End the object if that has not happened yet, then forward to CbWriter::Save(BinaryWriter&). */ ZENCORE_API void Save(BinaryWriter& Writer) + { + Finalize(); + return CbWriter::Save(Writer); + } + + /** Deleted in this class. NOTE(review): presumably because the size is not meaningful before the object has been finalized - confirm intent. */ uint64_t GetSaveSize() = delete; + + /** End the object started by the constructor. Only the first call has an effect. */ void Finalize() + { + if (m_Finalized == false) + { + EndObject(); + m_Finalized = true; + } + } + + /* NOTE(review): these deleted members take a CbWriter, so they are not the copy operations of CbObjectWriter; copying is still rejected because the CbWriter base deletes its own copy operations. */ CbObjectWriter(const CbWriter&) = delete; + CbObjectWriter& operator=(const CbWriter&) = delete; + +private: + /** Set once EndObject() has been called by Finalize(). */ bool m_Finalized = false; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** Write the field name if valid in this state, otherwise write the string value. */ +inline CbWriter& +operator<<(CbWriter& Writer, std::string_view NameOrValue) +{ + Writer.SetNameOrAddString(NameOrValue); + return Writer; +} + +/** Write the field name if valid in this state, otherwise write the string value. 
*/ +inline CbWriter& +operator<<(CbWriter& Writer, const char* NameOrValue) +{ + return Writer << std::string_view(NameOrValue); +} + +/** Copy the value of an existing field view into the writer. */ inline CbWriter& +operator<<(CbWriter& Writer, const CbFieldView& Value) +{ + Writer.AddField(Value); + return Writer; +} + +/** Copy the value of an existing field into the writer. */ inline CbWriter& +operator<<(CbWriter& Writer, const CbField& Value) +{ + Writer.AddField(Value); + return Writer; +} + +/** Copy the value of an existing object view into the writer. */ inline CbWriter& +operator<<(CbWriter& Writer, const CbObjectView& Value) +{ + Writer.AddObject(Value); + return Writer; +} + +/** Copy the value of an existing object into the writer. */ inline CbWriter& +operator<<(CbWriter& Writer, const CbObject& Value) +{ + Writer.AddObject(Value); + return Writer; +} + +/** Copy the value of an existing array view into the writer. */ inline CbWriter& +operator<<(CbWriter& Writer, const CbArrayView& Value) +{ + Writer.AddArray(Value); + return Writer; +} + +/** Copy the value of an existing array into the writer. */ inline CbWriter& +operator<<(CbWriter& Writer, const CbArray& Value) +{ + Writer.AddArray(Value); + return Writer; +} + +/** Write a null field. NOTE(review): the unqualified nullptr_t relies on a header that declares it in the global namespace being included transitively; std::nullptr_t from <cstddef> is the portable spelling. */ inline CbWriter& +operator<<(CbWriter& Writer, nullptr_t) +{ + Writer.AddNull(); + return Writer; +} + +/** Write a string field from a wide string. Unlike the narrow overloads this always writes a value, never a field name. */ inline CbWriter& +operator<<(CbWriter& Writer, std::wstring_view Value) +{ + Writer.AddString(Value); + return Writer; +} + +/** Write a string field from a null-terminated wide string. Always writes a value, never a field name. */ inline CbWriter& +operator<<(CbWriter& Writer, const wchar_t* Value) +{ + Writer.AddString(Value); + return Writer; +} + +/** Write an integer field. */ inline CbWriter& +operator<<(CbWriter& Writer, int32_t Value) +{ + Writer.AddInteger(Value); + return Writer; +} + +/** Write an integer field. */ inline CbWriter& +operator<<(CbWriter& Writer, int64_t Value) +{ + Writer.AddInteger(Value); + return Writer; +} + +/** Write an integer field. */ inline CbWriter& +operator<<(CbWriter& Writer, uint32_t Value) +{ + Writer.AddInteger(Value); + return Writer; +} + +/** Write an integer field. */ inline CbWriter& +operator<<(CbWriter& Writer, uint64_t Value) +{ + Writer.AddInteger(Value); + return Writer; +} + +/** Write a float field from a 32-bit float value. */ inline CbWriter& +operator<<(CbWriter& Writer, float Value) +{ + Writer.AddFloat(Value); + return Writer; +} + +/** Write a float field from a 64-bit float value. */ inline CbWriter& +operator<<(CbWriter& Writer, double Value) +{ + Writer.AddFloat(Value); + return Writer; +} + +/** Write a bool field. */ inline CbWriter& +operator<<(CbWriter& Writer, bool Value) +{ + 
Writer.AddBool(Value); + return Writer; +} + +/** Write a field referencing the attachment by its hash. */ inline CbWriter& +operator<<(CbWriter& Writer, const CbAttachment& Attachment) +{ + Writer.AddAttachment(Attachment); + return Writer; +} + +/** Write a hash field. */ inline CbWriter& +operator<<(CbWriter& Writer, const IoHash& Value) +{ + Writer.AddHash(Value); + return Writer; +} + +/** Write a UUID field. */ inline CbWriter& +operator<<(CbWriter& Writer, const Guid& Value) +{ + Writer.AddUuid(Value); + return Writer; +} + +/** Write an ObjectId field. */ inline CbWriter& +operator<<(CbWriter& Writer, const Oid& Value) +{ + Writer.AddObjectId(Value); + return Writer; +} + +/* Declared here and defined out of line. NOTE(review): presumably these forward to AddDateTime / AddTimeSpan - confirm in the implementation. */ ZENCORE_API CbWriter& operator<<(CbWriter& Writer, DateTime Value); +ZENCORE_API CbWriter& operator<<(CbWriter& Writer, TimeSpan Value); + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void usonbuilder_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/compactbinarypackage.h b/zencore/include/zencore/compactbinarypackage.h new file mode 100644 index 000000000..c98ab047f --- /dev/null +++ b/zencore/include/zencore/compactbinarypackage.h @@ -0,0 +1,305 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/zencore.h> + +#include <zencore/compactbinary.h> +#include <zencore/iohash.h> + +#include <functional> +#include <span> + +namespace zen { + +class CbWriter; +class BinaryReader; +class BinaryWriter; +class IoBuffer; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * An attachment is either binary or compact binary and is identified by its hash. + * + * A compact binary attachment is also a valid binary attachment and may be accessed as binary. + * + * Attachments are serialized as one or two compact binary fields with no name. A Binary field is + * written first with its content. 
The content hash is omitted when the content size is zero, and + * is otherwise written as a BinaryReference or CompactBinaryReference depending on the type. + */ +class CbAttachment +{ +public: + /** Construct a null attachment. */ + CbAttachment() = default; + + /** Construct a compact binary attachment. Value is cloned if not owned. */ + inline explicit CbAttachment(CbFieldIterator Value) : CbAttachment(std::move(Value), nullptr) {} + + /** Construct a compact binary attachment. Value is cloned if not owned. Hash must match Value. */ + inline explicit CbAttachment(CbFieldIterator Value, const IoHash& Hash) : CbAttachment(std::move(Value), &Hash) {} + + /** Construct a binary attachment. Value is cloned if not owned. */ + inline explicit CbAttachment(SharedBuffer Value) : CbAttachment(std::move(Value), nullptr) {} + + /** Construct a binary attachment. Value is cloned if not owned. Hash must match Value. */ + inline explicit CbAttachment(SharedBuffer Value, const IoHash& Hash) : CbAttachment(std::move(Value), &Hash) {} + + /** Reset this to a null attachment. */ + inline void Reset() { *this = CbAttachment(); } + + /** Whether the attachment has a value. */ + inline explicit operator bool() const { return !IsNull(); } + + /** Whether the attachment is null, i.e. has no value (its buffer is null). */ + inline bool IsNull() const { return !Buffer; } + + /** Access the attachment as binary. Defaults to a null buffer on error. */ + ZENCORE_API SharedBuffer AsBinaryView() const; + + /** Access the attachment as compact binary. Defaults to a field iterator with no value on error. */ + ZENCORE_API CbFieldIterator AsCompactBinary() const; + + /** Returns whether the attachment is binary or compact binary, i.e. true for any attachment whose buffer is not null. */ + inline bool IsBinary() const { return !Buffer.IsNull(); } + + /** Returns whether the attachment is compact binary. */ + inline bool IsCompactBinary() const { return CompactBinary.HasValue(); } + + /** Returns the hash of the attachment value. 
*/ + inline const IoHash& GetHash() const { return Hash; } + + /** Compares attachments by their hash only; the content and type are not compared. Any discrepancy in type must be handled externally. */ + inline bool operator==(const CbAttachment& Attachment) const { return Hash == Attachment.Hash; } + inline bool operator!=(const CbAttachment& Attachment) const { return Hash != Attachment.Hash; } + inline bool operator<(const CbAttachment& Attachment) const { return Hash < Attachment.Hash; } + + /** + * Load the attachment from compact binary as written by Save. + * + * The attachment references the input iterator if it is owned, and otherwise clones the value. + * + * The iterator is advanced as attachment fields are consumed from it. + */ + ZENCORE_API void Load(CbFieldIterator& Fields); + + /** + * Load the attachment from compact binary as written by Save, reading from a BinaryReader. + * + * @param Allocator Buffer allocator to use; defaults to UniqueBuffer::Alloc. + */ + ZENCORE_API void Load(BinaryReader& Reader, BufferAllocator Allocator = UniqueBuffer::Alloc); + + /** + * Load the attachment from compact binary as written by Save, reading from an IoBuffer. + * + * @param Allocator Buffer allocator to use; defaults to UniqueBuffer::Alloc. + */ + ZENCORE_API void Load(IoBuffer& Buffer, BufferAllocator Allocator = UniqueBuffer::Alloc); + + /** Save the attachment into the writer as a stream of compact binary fields. */ + ZENCORE_API void Save(CbWriter& Writer) const; + + /** Save the attachment into the writer as a stream of compact binary fields. */ + ZENCORE_API void Save(BinaryWriter& Writer) const; + +private: + /** Shared implementation of the public constructors, which pass a null Hash when the caller supplied none. */ ZENCORE_API CbAttachment(CbFieldIterator Value, const IoHash* Hash); + ZENCORE_API CbAttachment(SharedBuffer Value, const IoHash* Hash); + + /** An owned buffer containing the binary or compact binary data. */ + SharedBuffer Buffer; + /** A field iterator that is valid only for compact binary attachments. */ + CbFieldViewIterator CompactBinary; + /** A hash of the attachment value. 
*/ + IoHash Hash; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * A package is a compact binary object with attachments for its external references. + * + * A package is basically a Merkle tree with compact binary as its root and other non-leaf nodes, + * and either binary or compact binary as its leaf nodes. A node references its child nodes using + * BinaryHash or FieldHash fields in its compact binary representation. + * + * It is invalid for a package to include attachments that are not referenced by its object or by + * one of its referenced compact binary attachments. When attachments are added explicitly, it is + * the responsibility of the package creator to follow this requirement. Attachments that are not + * referenced may not survive a round-trip through certain storage systems. + * + * It is valid for a package to exclude referenced attachments, but then it is the responsibility + * of the package consumer to have a mechanism for resolving those references when necessary. + * + * A package is serialized as a sequence of compact binary fields with no name. The object may be + * both preceded and followed by attachments. The object itself is written as an Object field and + * followed by its hash in a CompactBinaryReference field when the object is non-empty. A package + * ends with a Null field. The canonical order of components is the object and its hash, followed + * by the attachments ordered by hash, followed by a Null field. It is valid for a package to + * have its components serialized in any order, provided there is at most one object and the null + * field is written last. + */ +class CbPackage +{ +public: + /** + * A function that resolves a hash to a buffer containing the data matching that hash. + * + * The resolver may return a null buffer to skip resolving an attachment for the hash. 
+ */ + using AttachmentResolver = std::function<SharedBuffer(const IoHash& Hash)>; + + /** Construct a null package. */ + CbPackage() = default; + + /** + * Construct a package from a root object without gathering attachments. + * + * @param InObject The root object, which will be cloned unless it is owned. + */ + inline explicit CbPackage(CbObject InObject) { SetObject(std::move(InObject)); } + + /** + * Construct a package from a root object and gather attachments using the resolver. + * + * @param InObject The root object, which will be cloned unless it is owned. + * @param InResolver A function that is invoked for every reference and binary reference field. + */ + inline explicit CbPackage(CbObject InObject, AttachmentResolver InResolver) { SetObject(std::move(InObject), InResolver); } + + /** + * Construct a package from a root object without gathering attachments. + * + * @param InObject The root object, which will be cloned unless it is owned. + * @param InObjectHash The hash of the object, which must match to avoid validation errors. + */ + inline explicit CbPackage(CbObject InObject, const IoHash& InObjectHash) { SetObject(std::move(InObject), InObjectHash); } + + /** + * Construct a package from a root object and gather attachments using the resolver. + * + * @param InObject The root object, which will be cloned unless it is owned. + * @param InObjectHash The hash of the object, which must match to avoid validation errors. + * @param InResolver A function that is invoked for every reference and binary reference field. + */ + inline explicit CbPackage(CbObject InObject, const IoHash& InObjectHash, AttachmentResolver InResolver) + { + SetObject(std::move(InObject), InObjectHash, InResolver); + } + + /** Reset this to a null package. */ + inline void Reset() { *this = CbPackage(); } + + /** Whether the package has a non-empty object or attachments. 
*/ + inline explicit operator bool() const { return !IsNull(); } + + /** Whether the package has an empty object and no attachments. */ + inline bool IsNull() const { return !Object.CreateIterator() && Attachments.size() == 0; } + + /** Returns the compact binary object for the package. */ + inline const CbObject& GetObject() const { return Object; } + + /** Returns the hash of the compact binary object for the package. */ + inline const IoHash& GetObjectHash() const { return ObjectHash; } + + /** + * Set the root object without gathering attachments. + * + * @param InObject The root object, which will be cloned unless it is owned. + */ + inline void SetObject(CbObject InObject) { SetObject(std::move(InObject), nullptr, nullptr); } + + /** + * Set the root object and gather attachments using the resolver. + * + * @param InObject The root object, which will be cloned unless it is owned. + * @param InResolver A function that is invoked for every reference and binary reference field. + */ + inline void SetObject(CbObject InObject, AttachmentResolver InResolver) { SetObject(std::move(InObject), nullptr, &InResolver); } + + /** + * Set the root object without gathering attachments. + * + * @param InObject The root object, which will be cloned unless it is owned. + * @param InObjectHash The hash of the object, which must match to avoid validation errors. + */ + inline void SetObject(CbObject InObject, const IoHash& InObjectHash) { SetObject(std::move(InObject), &InObjectHash, nullptr); } + + /** + * Set the root object and gather attachments using the resolver. + * + * @param InObject The root object, which will be cloned unless it is owned. + * @param InObjectHash The hash of the object, which must match to avoid validation errors. + * @param InResolver A function that is invoked for every reference and binary reference field. 
+ */ + inline void SetObject(CbObject InObject, const IoHash& InObjectHash, AttachmentResolver InResolver) + { + SetObject(std::move(InObject), &InObjectHash, &InResolver); + } + + /** Returns the attachments in this package. */ + inline std::span<const CbAttachment> GetAttachments() const { return Attachments; } + + /** + * Find an attachment by its hash. + * + * @return The attachment, or null if the attachment is not found. + * @note The returned pointer is only valid until the attachments on this package are modified. + */ + ZENCORE_API const CbAttachment* FindAttachment(const IoHash& Hash) const; + + /** Find an attachment if it exists in the package. */ + inline const CbAttachment* FindAttachment(const CbAttachment& Attachment) const { return FindAttachment(Attachment.GetHash()); } + + /** Add the attachment to this package. */ + inline void AddAttachment(const CbAttachment& Attachment) { AddAttachment(Attachment, nullptr); } + + /** Add the attachment to this package, along with any references that can be resolved. */ + inline void AddAttachment(const CbAttachment& Attachment, AttachmentResolver Resolver) { AddAttachment(Attachment, &Resolver); } + + /** + * Remove an attachment by hash. + * + * @return Number of attachments removed, which will be either 0 or 1. + */ + ZENCORE_API int32_t RemoveAttachment(const IoHash& Hash); + inline int32_t RemoveAttachment(const CbAttachment& Attachment) { return RemoveAttachment(Attachment.GetHash()); } + + /** Compares packages by their object and attachment hashes. */ + ZENCORE_API bool Equals(const CbPackage& Package) const; + inline bool operator==(const CbPackage& Package) const { return Equals(Package); } + inline bool operator!=(const CbPackage& Package) const { return !Equals(Package); } + + /** + * Load the object and attachments from compact binary as written by Save. 
+ * + * The object and attachments reference the input iterator, if it is owned, and otherwise clones + * the object and attachments individually to make owned copies. + * + * The iterator is advanced as object and attachment fields are consumed from it. + */ + ZENCORE_API void Load(CbFieldIterator& Fields); + + ZENCORE_API void Load(IoBuffer& Buffer, BufferAllocator Allocator = UniqueBuffer::Alloc); + + ZENCORE_API void Load(BinaryReader& Reader, BufferAllocator Allocator = UniqueBuffer::Alloc); + + /** Save the object and attachments into the writer as a stream of compact binary fields. */ + ZENCORE_API void Save(CbWriter& Writer) const; + + /** Save the object and attachments into the writer as a stream of compact binary fields. */ + ZENCORE_API void Save(BinaryWriter& Writer) const; + +private: + ZENCORE_API void SetObject(CbObject Object, const IoHash* Hash, AttachmentResolver* Resolver); + ZENCORE_API void AddAttachment(const CbAttachment& Attachment, AttachmentResolver* Resolver); + + void GatherAttachments(const CbFieldViewIterator& Fields, AttachmentResolver Resolver); + + /** Attachments ordered by their hash. */ + std::vector<CbAttachment> Attachments; + CbObject Object; + IoHash ObjectHash; +}; + +void usonpackage_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/compactbinaryvalidation.h b/zencore/include/zencore/compactbinaryvalidation.h new file mode 100644 index 000000000..3a3f432be --- /dev/null +++ b/zencore/include/zencore/compactbinaryvalidation.h @@ -0,0 +1,192 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/zencore.h> + +#include <zencore/compactbinary.h> +#include <zencore/enumflags.h> +#include <zencore/iobuffer.h> +#include <zencore/iohash.h> +#include <zencore/refcount.h> +#include <zencore/sha1.h> + +#include <gsl/gsl-lite.hpp> + +namespace zen { + +/** Flags for validating compact binary data. 
*/ +enum class CbValidateMode : uint32_t +{ + /** Skip validation if no other validation modes are enabled. */ + None = 0, + + /** + * Validate that the value can be read and stays inside the bounds of the memory view. + * + * This is the minimum level of validation required to be able to safely read a field, array, + * or object without the risk of crashing or reading out of bounds. + */ + Default = 1 << 0, + + /** + * Validate that object fields have unique non-empty names and array fields have no names. + * + * Name validation failures typically do not inhibit reading the input, but duplicated fields + * cannot be looked up by name other than the first, and converting to other data formats can + * fail in the presence of naming issues. + */ + Names = 1 << 1, + + /** + * Validate that fields are serialized in the canonical format. + * + * Format validation failures typically do not inhibit reading the input. Values that fail in + * this mode require more memory than in the canonical format, and comparisons of such values + * for equality are not reliable. Examples of failures include uniform arrays or objects that + * were not encoded uniformly, variable-length integers that could be encoded in fewer bytes, + * or 64-bit floats that could be encoded in 32 bits without loss of precision. + */ + Format = 1 << 2, + + /** + * Validate that there is no padding after the value before the end of the memory view. + * + * Padding validation failures have no impact on the ability to read the input, but are using + * more memory than necessary. + */ + Padding = 1 << 3, + + /** + * Validate that a package or attachment has the expected fields and matches its saved hashes. + */ + Package = 1 << 4, + + /** Perform all validation described above. 
*/ + All = Default | Names | Format | Padding | Package, +}; + +ENUM_CLASS_FLAGS(CbValidateMode); + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** Flags for compact binary validation errors. Multiple flags may be combined. */ +enum class CbValidateError : uint32_t +{ + /** The input had no validation errors. */ + None = 0, + + // Mode: Default + + /** The input cannot be read without reading out of bounds. */ + OutOfBounds = 1 << 0, + /** The input has a field with an unrecognized or invalid type. */ + InvalidType = 1 << 1, + + // Mode: Names + + /** An object had more than one field with the same name. */ + DuplicateName = 1 << 2, + /** An object had a field with no name. */ + MissingName = 1 << 3, + /** An array field had a name. */ + ArrayName = 1 << 4, + + // Mode: Format + + /** A name or string payload is not valid UTF-8. */ + InvalidString = 1 << 5, + /** A size or integer payload can be encoded in fewer bytes. */ + InvalidInteger = 1 << 6, + /** A float64 payload can be encoded as a float32 without loss of precision. */ + InvalidFloat = 1 << 7, + /** An object has the same type for every field but is not uniform. */ + NonUniformObject = 1 << 8, + /** An array has the same type for every field and non-empty payloads but is not uniform. */ + NonUniformArray = 1 << 9, + + // Mode: Padding + + /** A value did not use the entire memory view given for validation. */ + Padding = 1 << 10, + + // Mode: Package + + /** The package or attachment had missing fields or fields out of order. */ + InvalidPackageFormat = 1 << 11, + /** The object or an attachment did not match the hash stored for it. */ + InvalidPackageHash = 1 << 12, + /** The package contained more than one copy of the same attachment. */ + DuplicateAttachments = 1 << 13, + /** The package contained more than one object. */ + MultiplePackageObjects = 1 << 14, + /** The package contained an object with no fields. 
*/ + NullPackageObject = 1 << 15, + /** The package contained a null attachment. */ + NullPackageAttachment = 1 << 16, +}; + +ENUM_CLASS_FLAGS(CbValidateError); + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + * Validate the compact binary data for one field in the view as specified by the mode flags. + * + * Only one top-level field is processed from the view, and validation recurses into any array or + * object within that field. To validate multiple consecutive top-level fields, call the function + * once for each top-level field. If the given view might contain multiple top-level fields, then + * either exclude the Padding flag from the Mode or use MeasureCompactBinary to break up the view + * into its constituent fields before validating. + * + * @param View A memory view containing at least one top-level field. + * @param Mode A combination of the flags for the types of validation to perform. + * @param Type HasFieldType means that View contains the type. Otherwise, use the given type. + * @return None on success, otherwise the flags for the types of errors that were detected. + */ +ZENCORE_API CbValidateError ValidateCompactBinary(MemoryView View, CbValidateMode Mode, CbFieldType Type = CbFieldType::HasFieldType); + +/** + * Validate the compact binary data for every field in the view as specified by the mode flags. + * + * This function expects the entire view to contain fields. Any trailing region of the view which + * does not contain a valid field will produce an OutOfBounds or InvalidType error instead of the + * Padding error that would be produced by the single field validation function. + * + * @see ValidateCompactBinary + */ +ZENCORE_API CbValidateError ValidateCompactBinaryRange(MemoryView View, CbValidateMode Mode); + +/** + * Validate the compact binary attachment pointed to by the view as specified by the mode flags. 
+ * + * The attachment is validated with ValidateCompactBinary by using the validation mode specified. + * Include CbValidateMode::Package to validate the attachment format and hash. + * + * @see ValidateCompactBinary + * + * @param View A memory view containing an attachment. + * @param Mode A combination of the flags for the types of validation to perform. + * @return None on success, otherwise the flags for the types of errors that were detected. + */ +ZENCORE_API CbValidateError ValidateCompactBinaryAttachment(MemoryView View, CbValidateMode Mode); + +/** + * Validate the compact binary package pointed to by the view as specified by the mode flags. + * + * The package, and attachments, are validated with ValidateCompactBinary by using the validation + * mode specified. Include CbValidateMode::Package to validate the package format and hashes. + * + * @see ValidateCompactBinary + * + * @param View A memory view containing a package. + * @param Mode A combination of the flags for the types of validation to perform. + * @return None on success, otherwise the flags for the types of errors that were detected. + */ +ZENCORE_API CbValidateError ValidateCompactBinaryPackage(MemoryView View, CbValidateMode Mode); + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void usonvalidation_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/compress.h b/zencore/include/zencore/compress.h new file mode 100644 index 000000000..759cf8444 --- /dev/null +++ b/zencore/include/zencore/compress.h @@ -0,0 +1,53 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "zencore/zencore.h" + +namespace zen::CompressedBuffer { + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +static constexpr uint64_t DefaultBlockSize = 256 * 1024; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** Method used to compress the data in a compressed buffer. */ +enum class Method : uint8_t +{ + /** Header is followed by one uncompressed block. */ + None = 0, + /** Header is followed by an array of compressed block sizes then the compressed blocks. */ + LZ4 = 4, +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** Header used on every compressed buffer. Always stored in big-endian format */ +struct BufferHeader +{ + static constexpr uint32_t ExpectedMagic = 0xb7756362; + + /** A magic number to identify a compressed buffer. Always 0xb7756362 */ + uint32_t Magic = ExpectedMagic; + /** A CRC-32 used to check integrity of the buffer. Uses the polynomial 0x04c11db7 */ + uint32_t Crc32 = 0; + /** The method used to compress the buffer. Affects layout of data following the header */ + Method Method = Method::None; + /** The reserved bytes must be initialized to zero */ + uint8_t Reserved[2]{}; + /** The power of two size of every uncompressed block except the last. 
Size is 1 << BlockSizeExponent */ + uint8_t BlockSizeExponent = 0; + /** The number of blocks that follow the header */ + uint32_t BlockCount = 0; + /** The total size of the uncompressed data */ + uint64_t TotalRawSize = 0; + /** The total size of the compressed data including the header */ + uint64_t TotalCompressedSize = 0; +}; + +static_assert(sizeof(BufferHeader) == 32, "BufferHeader is the wrong size"); + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace zen::CompressedBuffer diff --git a/zencore/include/zencore/endian.h b/zencore/include/zencore/endian.h new file mode 100644 index 000000000..27c831bb1 --- /dev/null +++ b/zencore/include/zencore/endian.h @@ -0,0 +1,61 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +namespace zen { + +inline uint16_t +ByteSwap(uint16_t x) +{ + return _byteswap_ushort(x); +} + +inline uint32_t +ByteSwap(uint32_t x) +{ + return _byteswap_ulong(x); +} + +inline uint64_t +ByteSwap(uint64_t x) +{ + return _byteswap_uint64(x); +} + +inline uint16_t +FromNetworkOrder(uint16_t x) +{ + return ByteSwap(x); +} + +inline uint32_t +FromNetworkOrder(uint32_t x) +{ + return ByteSwap(x); +} + +inline uint64_t +FromNetworkOrder(uint64_t x) +{ + return ByteSwap(x); +} + +inline uint16_t +FromNetworkOrder(int16_t x) +{ + return ByteSwap(uint16_t(x)); +} + +inline uint32_t +FromNetworkOrder(int32_t x) +{ + return ByteSwap(uint32_t(x)); +} + +inline uint64_t +FromNetworkOrder(int64_t x) +{ + return ByteSwap(uint64_t(x)); +} + +} // namespace zen diff --git a/zencore/include/zencore/enumflags.h b/zencore/include/zencore/enumflags.h new file mode 100644 index 000000000..ebe747bf0 --- /dev/null +++ b/zencore/include/zencore/enumflags.h @@ -0,0 +1,61 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "zencore.h" + +namespace zen { + +// Enum class helpers + +// Defines all bitwise operators for enum classes so it can be (mostly) used as a regular flags enum +#define ENUM_CLASS_FLAGS(Enum) \ + inline Enum& operator|=(Enum& Lhs, Enum Rhs) { return Lhs = (Enum)((__underlying_type(Enum))Lhs | (__underlying_type(Enum))Rhs); } \ + inline Enum& operator&=(Enum& Lhs, Enum Rhs) { return Lhs = (Enum)((__underlying_type(Enum))Lhs & (__underlying_type(Enum))Rhs); } \ + inline Enum& operator^=(Enum& Lhs, Enum Rhs) { return Lhs = (Enum)((__underlying_type(Enum))Lhs ^ (__underlying_type(Enum))Rhs); } \ + inline constexpr Enum operator|(Enum Lhs, Enum Rhs) { return (Enum)((__underlying_type(Enum))Lhs | (__underlying_type(Enum))Rhs); } \ + inline constexpr Enum operator&(Enum Lhs, Enum Rhs) { return (Enum)((__underlying_type(Enum))Lhs & (__underlying_type(Enum))Rhs); } \ + inline constexpr Enum operator^(Enum Lhs, Enum Rhs) { return (Enum)((__underlying_type(Enum))Lhs ^ (__underlying_type(Enum))Rhs); } \ + inline constexpr bool operator!(Enum E) { return !(__underlying_type(Enum))E; } \ + inline constexpr Enum operator~(Enum E) { return (Enum) ~(__underlying_type(Enum))E; } + +// Friends all bitwise operators for enum classes so the definition can be kept private / protected. 
+#define FRIEND_ENUM_CLASS_FLAGS(Enum) \ + friend Enum& operator|=(Enum& Lhs, Enum Rhs); \ + friend Enum& operator&=(Enum& Lhs, Enum Rhs); \ + friend Enum& operator^=(Enum& Lhs, Enum Rhs); \ + friend constexpr Enum operator|(Enum Lhs, Enum Rhs); \ + friend constexpr Enum operator&(Enum Lhs, Enum Rhs); \ + friend constexpr Enum operator^(Enum Lhs, Enum Rhs); \ + friend constexpr bool operator!(Enum E); \ + friend constexpr Enum operator~(Enum E); + +template<typename Enum> +constexpr bool +EnumHasAllFlags(Enum Flags, Enum Contains) +{ + return (((__underlying_type(Enum))Flags) & (__underlying_type(Enum))Contains) == ((__underlying_type(Enum))Contains); +} + +template<typename Enum> +constexpr bool +EnumHasAnyFlags(Enum Flags, Enum Contains) +{ + return (((__underlying_type(Enum))Flags) & (__underlying_type(Enum))Contains) != 0; +} + +template<typename Enum> +void +EnumAddFlags(Enum& Flags, Enum FlagsToAdd) +{ + Flags |= FlagsToAdd; +} + +template<typename Enum> +void +EnumRemoveFlags(Enum& Flags, Enum FlagsToRemove) +{ + Flags &= ~FlagsToRemove; +} + +} // namespace zen diff --git a/zencore/include/zencore/except.h b/zencore/include/zencore/except.h new file mode 100644 index 000000000..07c4833ff --- /dev/null +++ b/zencore/include/zencore/except.h @@ -0,0 +1,60 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <zencore/string.h> +#include <zencore/windows.h> +#include <string> + +namespace zen { + +class WindowsException : public std::exception +{ +public: + WindowsException(const char* Message) + { + m_hResult = HRESULT_FROM_WIN32(GetLastError()); + m_Message = Message; + } + + WindowsException(HRESULT hRes, const char* Message) + { + m_hResult = hRes; + m_Message = Message; + } + + WindowsException(HRESULT hRes, const char* Message, const char* Detail) + { + m_hResult = hRes; + + ExtendableStringBuilder<128> msg; + msg.Append(Message); + msg.Append(" (detail: '"); + msg.Append(Detail); + msg.Append("')"); + + m_Message = msg.c_str(); + } + + virtual const char* what() const noexcept override { return m_Message.c_str(); } /* noexcept: matches the exception specification of std::exception::what() */ + +private: + std::string m_Message; + HRESULT m_hResult; +}; + +ZENCORE_API void ThrowSystemException(HRESULT hRes, const char* Message); +inline void +ThrowSystemException(const char* Message) +{ + throw WindowsException(Message); +} + +inline void +ThrowIfFailed(HRESULT hRes, const char* Message) +{ + if (FAILED(hRes)) + ThrowSystemException(hRes, Message); +} + +} // namespace zen diff --git a/zencore/include/zencore/filesystem.h b/zencore/include/zencore/filesystem.h new file mode 100644 index 000000000..b20a1e7c6 --- /dev/null +++ b/zencore/include/zencore/filesystem.h @@ -0,0 +1,74 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "stream.h" +#include "zencore.h" + +#include <filesystem> +#include <functional> + +namespace zen { + +class IoBuffer; + +/** Delete directory (after deleting any contents) + */ +ZENCORE_API bool DeleteDirectories(const wchar_t* dir); +ZENCORE_API bool DeleteDirectories(const std::filesystem::path& dir); + +/** Ensure directory exists. 
+ + Will also create any required parent directories + */ +ZENCORE_API bool CreateDirectories(const wchar_t* dir); +ZENCORE_API bool CreateDirectories(const std::filesystem::path& dir); + +/** Ensure directory exists and delete contents (if any) before returning + */ +ZENCORE_API bool CleanDirectory(const wchar_t* dir); +ZENCORE_API bool CleanDirectory(const std::filesystem::path& dir); + +struct FileContents +{ + std::vector<IoBuffer> Data; + std::error_code ErrorCode; +}; + +ZENCORE_API FileContents ReadFile(std::filesystem::path Path); +ZENCORE_API bool ScanFile(std::filesystem::path Path, uint64_t ChunkSize, std::function<void(const void* Data, size_t Size)>&& ProcessFunc); +ZENCORE_API bool WriteFile(std::filesystem::path Path, const IoBuffer* const* Data, size_t BufferCount); + +struct CopyFileOptions +{ + bool EnableClone = true; + bool MustClone = false; +}; + +ZENCORE_API bool CopyFile(std::filesystem::path FromPath, std::filesystem::path ToPath, const CopyFileOptions& Options); +ZENCORE_API bool SupportsBlockRefCounting(std::filesystem::path Path); + +ZENCORE_API std::string ToUtf8(const std::filesystem::path& Path); + +/** + * Efficient file system traversal + * + * Uses the best available mechanism for the platform in question and could take + * advantage of any file system tracking mechanisms in the future + * + */ +class FileSystemTraversal +{ +public: + struct TreeVisitor + { + virtual void VisitFile(const std::filesystem::path& Parent, const std::wstring_view& File, uint64_t FileSize) = 0; + + // This should return true if we should recurse into the directory + virtual bool VisitDirectory(const std::filesystem::path& Parent, const std::wstring_view& DirectoryName) = 0; + }; + + void TraverseFileSystem(const std::filesystem::path& RootDir, TreeVisitor& Visitor); +}; + +} // namespace zen diff --git a/zencore/include/zencore/fmtutils.h b/zencore/include/zencore/fmtutils.h new file mode 100644 index 000000000..fb5a08d56 --- /dev/null +++ 
b/zencore/include/zencore/fmtutils.h @@ -0,0 +1,49 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/iohash.h> +#include <zencore/string.h> +#include <zencore/uid.h> + +#include <fmt/format.h> +#include <filesystem> +#include <string_view> + +// Custom formatting for some zencore types + +template<> +struct fmt::formatter<zen::IoHash> : formatter<string_view> +{ + template<typename FormatContext> + auto format(const zen::IoHash& Hash, FormatContext& ctx) + { + zen::IoHash::String_t String; + Hash.ToHexString(String); + return formatter<string_view>::format({String, zen::IoHash::StringLength}, ctx); + } +}; + +template<> +struct fmt::formatter<zen::Oid> : formatter<string_view> +{ + template<typename FormatContext> + auto format(const zen::Oid& Id, FormatContext& ctx) + { + zen::StringBuilder<32> String; + Id.ToString(String); + return formatter<string_view>::format({String.c_str(), zen::Oid::StringLength}, ctx); + } +}; + +template<> +struct fmt::formatter<std::filesystem::path> : formatter<string_view> +{ + template<typename FormatContext> + auto format(const std::filesystem::path& Path, FormatContext& ctx) + { + zen::ExtendableStringBuilder<128> String; + WideToUtf8(Path.c_str(), String); + return formatter<string_view>::format(String.ToView(), ctx); + } +}; diff --git a/zencore/include/zencore/httpclient.h b/zencore/include/zencore/httpclient.h new file mode 100644 index 000000000..4ede6839c --- /dev/null +++ b/zencore/include/zencore/httpclient.h @@ -0,0 +1,18 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "zencore.h" + +#include <zencore/string.h> +#include <gsl/gsl-lite.hpp> + +namespace zen { + +class HttpClient +{ +}; + +} // namespace zen + +void httpclient_forcelink(); // internal diff --git a/zencore/include/zencore/httpserver.h b/zencore/include/zencore/httpserver.h new file mode 100644 index 000000000..563245264 --- /dev/null +++ b/zencore/include/zencore/httpserver.h @@ -0,0 +1,373 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/enumflags.h> +#include <zencore/refcount.h> +#include <zencore/string.h> +#include <functional> +#include <gsl/gsl-lite.hpp> +#include <list> +#include <regex> +#include <span> +#include <unordered_map> +#include "zencore.h" + +namespace zen { + +class IoBuffer; +class CbObject; +class StringBuilderBase; + +enum class HttpVerb +{ + kGet = 1 << 0, + kPut = 1 << 1, + kPost = 1 << 2, + kDelete = 1 << 3, + kHead = 1 << 4, + kCopy = 1 << 5, + kOptions = 1 << 6 +}; + +gsl_DEFINE_ENUM_BITMASK_OPERATORS(HttpVerb); + +enum class HttpResponse +{ + // 1xx - Informational + + Continue = 100, //!< Indicates that the initial part of a request has been received and has not yet been rejected by the server. + SwitchingProtocols = 101, //!< Indicates that the server understands and is willing to comply with the client's request, via the + //!< Upgrade header field, for a change in the application protocol being used on this connection. + Processing = 102, //!< Is an interim response used to inform the client that the server has accepted the complete request, but has not + //!< yet completed it. + EarlyHints = 103, //!< Indicates to the client that the server is likely to send a final response with the header fields included in + //!< the informational response. + + // 2xx - Successful + + OK = 200, //!< Indicates that the request has succeeded. + Created = 201, //!< Indicates that the request has been fulfilled and has resulted in one or more new resources being created. 
+ Accepted = 202, //!< Indicates that the request has been accepted for processing, but the processing has not been completed. + NonAuthoritativeInformation = 203, //!< Indicates that the request was successful but the enclosed payload has been modified from that + //!< of the origin server's 200 (OK) response by a transforming proxy. + NoContent = 204, //!< Indicates that the server has successfully fulfilled the request and that there is no additional content to send + //!< in the response payload body. + ResetContent = 205, //!< Indicates that the server has fulfilled the request and desires that the user agent reset the \"document + //!< view\", which caused the request to be sent, to its original state as received from the origin server. + PartialContent = 206, //!< Indicates that the server is successfully fulfilling a range request for the target resource by transferring + //!< one or more parts of the selected representation that correspond to the satisfiable ranges found in the + //!< request's Range header field. + MultiStatus = 207, //!< Provides status for multiple independent operations. + AlreadyReported = 208, //!< Used inside a DAV:propstat response element to avoid enumerating the internal members of multiple bindings + //!< to the same collection repeatedly. [RFC 5842] + IMUsed = 226, //!< The server has fulfilled a GET request for the resource, and the response is a representation of the result of one + //!< or more instance-manipulations applied to the current instance. + + // 3xx - Redirection + + MultipleChoices = 300, //!< Indicates that the target resource has more than one representation, each with its own more specific + //!< identifier, and information about the alternatives is being provided so that the user (or user agent) can + //!< select a preferred representation by redirecting its request to one or more of those identifiers. 
+ MovedPermanently = 301, //!< Indicates that the target resource has been assigned a new permanent URI and any future references to this + //!< resource ought to use one of the enclosed URIs. + Found = 302, //!< Indicates that the target resource resides temporarily under a different URI. + SeeOther = 303, //!< Indicates that the server is redirecting the user agent to a different resource, as indicated by a URI in the + //!< Location header field, that is intended to provide an indirect response to the original request. + NotModified = 304, //!< Indicates that a conditional GET request has been received and would have resulted in a 200 (OK) response if it + //!< were not for the fact that the condition has evaluated to false. + UseProxy = 305, //!< \deprecated \parblock Due to security concerns regarding in-band configuration of a proxy. \endparblock + //!< The requested resource MUST be accessed through the proxy given by the Location field. + TemporaryRedirect = 307, //!< Indicates that the target resource resides temporarily under a different URI and the user agent MUST NOT + //!< change the request method if it performs an automatic redirection to that URI. + PermanentRedirect = 308, //!< The target resource has been assigned a new permanent URI and any future references to this resource + //!< ought to use one of the enclosed URIs. [...] This status code is similar to 301 Moved Permanently + //!< (Section 7.3.2 of rfc7231), except that it does not allow rewriting the request method from POST to GET. + + // 4xx - Client Error + BadRequest = 400, //!< Indicates that the server cannot or will not process the request because the received syntax is invalid, + //!< nonsensical, or exceeds some limitation on what the server is willing to process. + Unauthorized = 401, //!< Indicates that the request has not been applied because it lacks valid authentication credentials for the + //!< target resource. 
+ PaymentRequired = 402, //!< *Reserved* + Forbidden = 403, //!< Indicates that the server understood the request but refuses to authorize it. + NotFound = 404, //!< Indicates that the origin server did not find a current representation for the target resource or is not willing + //!< to disclose that one exists. + MethodNotAllowed = 405, //!< Indicates that the method specified in the request-line is known by the origin server but not supported by + //!< the target resource. + NotAcceptable = 406, //!< Indicates that the target resource does not have a current representation that would be acceptable to the + //!< user agent, according to the proactive negotiation header fields received in the request, and the server is + //!< unwilling to supply a default representation. + ProxyAuthenticationRequired = + 407, //!< Is similar to 401 (Unauthorized), but indicates that the client needs to authenticate itself in order to use a proxy. + RequestTimeout = + 408, //!< Indicates that the server did not receive a complete request message within the time that it was prepared to wait. + Conflict = 409, //!< Indicates that the request could not be completed due to a conflict with the current state of the resource. + Gone = 410, //!< Indicates that access to the target resource is no longer available at the origin server and that this condition is + //!< likely to be permanent. + LengthRequired = 411, //!< Indicates that the server refuses to accept the request without a defined Content-Length. + PreconditionFailed = + 412, //!< Indicates that one or more preconditions given in the request header fields evaluated to false when tested on the server. + PayloadTooLarge = 413, //!< Indicates that the server is refusing to process a request because the request payload is larger than the + //!< server is willing or able to process. 
+ URITooLong = 414, //!< Indicates that the server is refusing to service the request because the request-target is longer than the + //!< server is willing to interpret. + UnsupportedMediaType = 415, //!< Indicates that the origin server is refusing to service the request because the payload is in a format + //!< not supported by the target resource for this method. + RangeNotSatisfiable = 416, //!< Indicates that none of the ranges in the request's Range header field overlap the current extent of the + //!< selected resource or that the set of ranges requested has been rejected due to invalid ranges or an + //!< excessive request of small or overlapping ranges. + ExpectationFailed = 417, //!< Indicates that the expectation given in the request's Expect header field could not be met by at least + //!< one of the inbound servers. + ImATeapot = 418, //!< Any attempt to brew coffee with a teapot should result in the error code 418 I'm a teapot. + UnprocessableEntity = 422, //!< Means the server understands the content type of the request entity (hence a 415(Unsupported Media + //!< Type) status code is inappropriate), and the syntax of the request entity is correct (thus a 400 (Bad + //!< Request) status code is inappropriate) but was unable to process the contained instructions. + Locked = 423, //!< Means the source or destination resource of a method is locked. + FailedDependency = 424, //!< Means that the method could not be performed on the resource because the requested action depended on + //!< another action and that action failed. + UpgradeRequired = 426, //!< Indicates that the server refuses to perform the request using the current protocol but might be willing to + //!< do so after the client upgrades to a different protocol. + PreconditionRequired = 428, //!< Indicates that the origin server requires the request to be conditional. 
+ TooManyRequests = 429, //!< Indicates that the user has sent too many requests in a given amount of time (\"rate limiting\"). + RequestHeaderFieldsTooLarge = + 431, //!< Indicates that the server is unwilling to process the request because its header fields are too large. + UnavailableForLegalReasons = + 451, //!< This status code indicates that the server is denying access to the resource in response to a legal demand. + + // 5xx - Server Error + + InternalServerError = + 500, //!< Indicates that the server encountered an unexpected condition that prevented it from fulfilling the request. + NotImplemented = 501, //!< Indicates that the server does not support the functionality required to fulfill the request. + BadGateway = 502, //!< Indicates that the server, while acting as a gateway or proxy, received an invalid response from an inbound + //!< server it accessed while attempting to fulfill the request. + ServiceUnavailable = 503, //!< Indicates that the server is currently unable to handle the request due to a temporary overload or + //!< scheduled maintenance, which will likely be alleviated after some delay. + GatewayTimeout = 504, //!< Indicates that the server, while acting as a gateway or proxy, did not receive a timely response from an + //!< upstream server it needed to access in order to complete the request. + HTTPVersionNotSupported = 505, //!< Indicates that the server does not support, or refuses to support, the protocol version that was + //!< used in the request message. + VariantAlsoNegotiates = + 506, //!< Indicates that the server has an internal configuration error: the chosen variant resource is configured to engage in + //!< transparent content negotiation itself, and is therefore not a proper end point in the negotiation process. + InsufficientStorage = 507, //!< Means the method could not be performed on the resource because the server is unable to store the + //!< representation needed to successfully complete the request. 
+ LoopDetected = 508, //!< Indicates that the server terminated an operation because it encountered an infinite loop while processing a + //!< request with "Depth: infinity". [RFC 5842] + NotExtended = 510, //!< The policy for accessing the resource has not been met in the request. [RFC 2774] + NetworkAuthenticationRequired = 511, //!< Indicates that the client needs to authenticate to gain network access. +}; + +enum class HttpContentType +{ + kBinary, + kText, + kJSON, + kCbObject, + kCbPackage +}; + +/** HTTP Server Request + */ +class HttpServerRequest +{ +public: + HttpServerRequest(); + ~HttpServerRequest(); + + // Synchronous operations + + inline [[nodiscard]] std::string_view RelativeUri() const { return m_Uri; } // Returns URI without service prefix + inline [[nodiscard]] std::string_view QueryString() const { return m_QueryString; } + inline bool IsHandled() const { return m_IsHandled; } + + struct QueryParams + { + std::vector<std::pair<std::string_view, std::string_view>> KvPairs; + + std::string_view GetValue(std::string_view ParamName) + { + for (const auto& Kv : KvPairs) + { + const std::string_view& Key = Kv.first; + + if (Key.size() == ParamName.size()) + { + if (0 == _strnicmp(Key.data(), ParamName.data(), Key.size())) + { + return Kv.second; + } + } + } + + return std::string_view(); + } + }; + + QueryParams GetQueryParams(); + + inline HttpVerb RequestVerb() const { return m_Verb; } + + const char* HeaderAccept() const; + const char* HeaderAcceptEncoding() const; + const char* HeaderContentType() const; + const char* HeaderContentEncoding() const; + inline uint64_t HeaderContentLength() const { return m_ContentLength; } + + void SetSuppressResponseBody() { m_SuppressBody = true; } + + // Asynchronous operations + + /** Read POST/PUT payload + + This will return a null buffer if the contents are not fully available yet, and the handler should + at that point return - another completion request will be issues once the contents have been received 
+ fully. + */ + virtual IoBuffer ReadPayload() = 0; + + /** Respond with payload + + Note that this is destructive in the sense that the IoBuffer instances referred to by Blobs will be + moved into our response handler array where they are kept alive, in order to reduce ref-counting storms + */ + virtual void WriteResponse(HttpResponse HttpResponseCode, HttpContentType ContentType, std::span<IoBuffer> Blobs) = 0; + virtual void WriteResponse(HttpResponse HttpResponseCode, HttpContentType ContentType, IoBuffer Blob); + virtual void WriteResponse(HttpResponse HttpResponseCode) = 0; + + virtual void WriteResponse(HttpResponse HttpResponseCode, HttpContentType ContentType, std::u8string_view ResponseString) = 0; + + void WriteResponse(HttpResponse HttpResponseCode, CbObject Data); + void WriteResponse(HttpResponse HttpResponseCode, HttpContentType ContentType, std::string_view ResponseString); + +protected: + bool m_IsHandled = false; + bool m_SuppressBody = false; + HttpVerb m_Verb = HttpVerb::kGet; + uint64_t m_ContentLength = ~0ull; + ExtendableStringBuilder<256> m_Uri; + ExtendableStringBuilder<256> m_QueryString; +}; + +class HttpServerException : public std::exception +{ +public: + HttpServerException(const char* Message, uint32_t Error) : std::exception(Message), m_ErrorCode(Error) {} + +private: + uint32_t m_ErrorCode; +}; + +class HttpService +{ +public: + HttpService() = default; + virtual ~HttpService() = default; + + virtual const char* BaseUri() const = 0; + virtual void HandleRequest(HttpServerRequest& HttpServiceRequest) = 0; + + // Internals + + inline void SetUriPrefixLength(size_t PrefixLength) { m_UriPrefixLength = (int)PrefixLength; } + inline int UriPrefixLength() const { return m_UriPrefixLength; } + +private: + int m_UriPrefixLength = 0; +}; + +/** HTTP server + */ +class HttpServer +{ +public: + HttpServer(); + ~HttpServer(); + + void AddEndpoint(const char* endpoint, std::function<void(HttpServerRequest&)> handler); + void 
AddEndpoint(HttpService& Service); + + void Initialize(int BasePort); + void Run(bool TestMode); + void RequestExit(); + +private: + struct Impl; + + RefPtr<Impl> m_Impl; +}; + +////////////////////////////////////////////////////////////////////////// + +class HttpRouterRequest +{ +public: + HttpRouterRequest(HttpServerRequest& Request) : m_HttpRequest(Request) {} + + ZENCORE_API std::string GetCapture(int Index) const; + inline HttpServerRequest& ServerRequest() { return m_HttpRequest; } + +private: + using MatchResults_t = std::match_results<std::string_view::const_iterator>; + + HttpServerRequest& m_HttpRequest; + MatchResults_t m_Match; + + friend class HttpRequestRouter; +}; + +inline std::string +HttpRouterRequest::GetCapture(int Index) const +{ + ZEN_ASSERT(Index < m_Match.size()); + + return m_Match[Index]; +} + +////////////////////////////////////////////////////////////////////////// + +class HttpRequestRouter +{ +public: + typedef std::function<void(HttpRouterRequest&)> HandlerFunc_t; + + void AddPattern(const char* Id, const char* Regex); + void RegisterRoute(const char* Regex, HandlerFunc_t&& HandlerFunc, HttpVerb SupportedVerbs); + bool HandleRequest(zen::HttpServerRequest& Request); + +private: + struct HandlerEntry + { + HandlerEntry(const char* Regex, HttpVerb SupportedVerbs, HandlerFunc_t&& Handler, const char* Pattern) + : RegEx(Regex, std::regex::icase | std::regex::ECMAScript) + , Verbs(SupportedVerbs) + , Handler(std::move(Handler)) + , Pattern(Pattern) + { + } + + ~HandlerEntry() = default; + + std::regex RegEx; + HttpVerb Verbs; + HandlerFunc_t Handler; + const char* Pattern; + }; + + std::list<HandlerEntry> m_Handlers; + std::unordered_map<std::string, std::string> m_PatternMap; +}; + +////////////////////////////////////////////////////////////////////////// +// +// HTTP Client +// + +class HttpClient +{ +}; + +} // namespace zen + +void http_forcelink(); // internal diff --git a/zencore/include/zencore/intmath.h 
b/zencore/include/zencore/intmath.h new file mode 100644 index 000000000..9d39b8226 --- /dev/null +++ b/zencore/include/zencore/intmath.h @@ -0,0 +1,140 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "zencore.h" + +#include <stdint.h> +#include <concepts> + +////////////////////////////////////////////////////////////////////////// + +#pragma intrinsic(_BitScanReverse) +#pragma intrinsic(_BitScanReverse64) + +namespace zen { + +inline constexpr bool +IsPow2(uint64_t n) +{ + return 0 == (n & (n - 1)); +} + +/// Round an integer up to the closest integer multiplier of 'base' ('base' must be a power of two) +template<std::integral T> +T +RoundUp(T Value, auto Base) +{ + ZEN_ASSERT_SLOW(IsPow2(Base)); + return ((Value + T(Base - 1)) & (~T(Base - 1))); +} + +bool +IsMultipleOf(std::integral auto Value, auto MultiplierPow2) +{ + ZEN_ASSERT_SLOW(IsPow2(MultiplierPow2)); + return (Value & (MultiplierPow2 - 1)) == 0; +} + +inline uint64_t +NextPow2(uint64_t n) +{ + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + + --n; + + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n |= n >> 32; + + return n + 1; +} + +static inline uint32_t +FloorLog2(uint32_t Value) +{ + // Use BSR to return the log2 of the integer + unsigned long Log2; + if (_BitScanReverse(&Log2, Value) != 0) + { + return Log2; + } + + return 0; +} + +static inline uint32_t +CountLeadingZeros(uint32_t Value) +{ + unsigned long Log2; + _BitScanReverse64(&Log2, (uint64_t(Value) << 1) | 1); + return 32 - Log2; +} + +static inline uint64_t +FloorLog2_64(uint64_t Value) +{ + unsigned long Log2; + long Mask = -long(_BitScanReverse64(&Log2, Value) != 0); + return Log2 & Mask; +} + +static inline uint64_t +CountLeadingZeros64(uint64_t Value) +{ + unsigned long Log2; + long Mask = -long(_BitScanReverse64(&Log2, Value) != 0); + return ((63 - Log2) & Mask) | (64 & ~Mask); +} + +static inline uint64_t +CeilLogTwo64(uint64_t Arg) +{ + 
int64_t Bitmask = ((int64_t)(CountLeadingZeros64(Arg) << 57)) >> 63; + return (64 - CountLeadingZeros64(Arg - 1)) & (~Bitmask); +} + +static inline uint64_t +CountTrailingZeros64(uint64_t Value) +{ + if (Value == 0) + { + return 64; + } + unsigned long BitIndex; // 0-based, where the LSB is 0 and MSB is 31 + _BitScanForward64(&BitIndex, Value); // Scans from LSB to MSB + return BitIndex; +} + +////////////////////////////////////////////////////////////////////////// + +static inline bool +IsPointerAligned(const void* Ptr, uint64_t Alignment) +{ + ZEN_ASSERT_SLOW(IsPow2(Alignment)); + + return 0 == (reinterpret_cast<uintptr_t>(Ptr) & (Alignment - 1)); +} + +////////////////////////////////////////////////////////////////////////// + +#ifdef min +# error "Looks like you did #include <windows.h> -- use <zencore/windows.h> instead" +#endif + +auto +Min(auto x, auto y) +{ + return x < y ? x : y; +} + +auto +Max(auto x, auto y) +{ + return x > y ? x : y; +} + +} // namespace zen diff --git a/zencore/include/zencore/iobuffer.h b/zencore/include/zencore/iobuffer.h new file mode 100644 index 000000000..c93d27959 --- /dev/null +++ b/zencore/include/zencore/iobuffer.h @@ -0,0 +1,272 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <memory.h> +#include "refcount.h" +#include "zencore.h" + +namespace zen { + +struct IoBufferExtendedCore; + +struct IoBufferFileReference +{ + void* FileHandle; + uint64_t FileChunkOffset; + uint64_t FileChunkSize; +}; + +struct IoBufferCore +{ +public: + IoBufferCore() = default; + inline IoBufferCore(const void* DataPtr, size_t SizeBytes) : m_DataPtr(DataPtr), m_DataBytes(SizeBytes) {} + inline IoBufferCore(const IoBufferCore* Outer, const void* DataPtr, size_t SizeBytes) + : m_DataPtr(DataPtr) + , m_DataBytes(SizeBytes) + , m_OuterCore(Outer) + { + } + + ZENCORE_API explicit IoBufferCore(size_t SizeBytes); + ZENCORE_API IoBufferCore(size_t SizeBytes, size_t Alignment); + ZENCORE_API ~IoBufferCore(); + + // Reference counting + + inline uint32_t AddRef() const { return AtomicIncrement(const_cast<IoBufferCore*>(this)->m_RefCount); } + inline uint32_t Release() const + { + const uint32_t NewRefCount = AtomicDecrement(const_cast<IoBufferCore*>(this)->m_RefCount); + if (NewRefCount == 0) + { + DeleteThis(); + } + return NewRefCount; + } + + // Copying reference counted objects doesn't make a lot of sense generally, so let's prevent it + + IoBufferCore(const IoBufferCore&) = delete; + IoBufferCore(IoBufferCore&&) = delete; + IoBufferCore& operator=(const IoBufferCore&) = delete; + IoBufferCore& operator=(IoBufferCore&&) = delete; + + // + + ZENCORE_API void Materialize() const; + ZENCORE_API void DeleteThis() const; + ZENCORE_API void MakeOwned(bool Immutable = true); + + inline void EnsureDataValid() const + { + if ((m_Flags & kIsExtended) && !(m_Flags & kIsMaterialized)) + Materialize(); + } + + inline bool IsOwned() const { return !!(m_Flags & kIsOwned); } + inline bool IsImmutable() const { return !(m_Flags & kIsMutable); } + inline bool IsNull() const { return m_DataBytes == 0; } + + inline IoBufferExtendedCore* ExtendedCore(); + inline const IoBufferExtendedCore* ExtendedCore() const; + + inline const void* DataPointer() const + { + 
EnsureDataValid(); + return m_DataPtr; + } + + inline size_t DataBytes() const { return m_DataBytes; } + + inline void Set(const void* Ptr, size_t Sz) + { + m_DataPtr = Ptr; + m_DataBytes = Sz; + } + + inline void SetIsOwned(bool NewState) + { + if (NewState) + { + m_Flags |= kIsOwned; + } + else + { + m_Flags &= ~kIsOwned; + } + } + + inline void SetIsImmutable(bool NewState) + { + if (!NewState) + { + m_Flags |= kIsMutable; + } + else + { + m_Flags &= ~kIsMutable; + } + } + + inline uint32_t GetRefCount() const { return m_RefCount; } + +protected: + uint32_t m_RefCount = 0; + mutable uint32_t m_Flags = 0; + mutable const void* m_DataPtr = nullptr; + size_t m_DataBytes = 0; + RefPtr<const IoBufferCore> m_OuterCore; + + enum Flags + { + kIsOwned = 1 << 0, + kIsMutable = 1 << 1, + kIsExtended = 1 << 2, // Is actually a SharedBufferExtendedCore + kIsMaterialized = 1 << 3, // Data pointers are valid + kLowLevelAlloc = 1 << 4, // Using direct memory allocation + }; + + void* AllocateBuffer(size_t InSize, size_t Alignment); + void FreeBuffer(); +}; + +/** + * An "Extended" core references a segment of a file + */ + +struct IoBufferExtendedCore : public IoBufferCore +{ + IoBufferExtendedCore(void* FileHandle, uint64_t Offset, uint64_t Size, bool TransferHandleOwnership); + IoBufferExtendedCore(const IoBufferExtendedCore* Outer, uint64_t Offset, uint64_t Size); + ~IoBufferExtendedCore(); + + enum ExtendedFlags + { + kOwnsFile = 1 << 8, + kOwnsMmap = 1 << 9 + }; + + void Materialize() const; + bool GetFileReference(IoBufferFileReference& OutRef) const; + +private: + void* m_FileHandle = nullptr; + uint64_t m_FileOffset = 0; + mutable void* m_MmapHandle = nullptr; + mutable void* m_MappedPointer = nullptr; +}; + +inline IoBufferExtendedCore* +IoBufferCore::ExtendedCore() +{ + if (m_Flags & kIsExtended) + { + return static_cast<IoBufferExtendedCore*>(this); + } + + return nullptr; +} + +inline const IoBufferExtendedCore* +IoBufferCore::ExtendedCore() const +{ + if (m_Flags & 
kIsExtended) + { + return static_cast<const IoBufferExtendedCore*>(this); + } + + return nullptr; +} + +/** + * I/O buffer + * + * This represents a reference to a payload in memory or on disk + * + */ +class IoBuffer +{ +public: + enum EAssumeOwnershipTag + { + AssumeOwnership + }; + enum ECloneTag + { + Clone + }; + enum EWrapTag + { + Wrap + }; + enum EFileTag + { + File + }; + enum EBorrowedFileTag + { + BorrowedFile + }; + + inline IoBuffer() = default; + inline IoBuffer(IoBuffer&& Rhs) noexcept = default; + inline IoBuffer(const IoBuffer& Rhs) = default; + inline IoBuffer& operator=(const IoBuffer& Rhs) = default; + inline IoBuffer& operator=(IoBuffer&& Rhs) noexcept = default; + + /** Create an uninitialized buffer of the given size + */ + ZENCORE_API explicit IoBuffer(size_t InSize); + + /** Create an uninitialized buffer of the given size with the specified alignment + */ + ZENCORE_API explicit IoBuffer(size_t InSize, uint64_t InAlignment); + + /** Create a buffer which references a sequence of bytes inside another buffer + */ + ZENCORE_API IoBuffer(const IoBuffer& OuterBuffer, size_t Offset, size_t SizeBytes); + + /** Create a buffer which references a range of bytes which we assume will live + * for the entire life time. 
+ */ + inline IoBuffer(EWrapTag, const void* DataPtr, size_t SizeBytes) : m_Core(new IoBufferCore(DataPtr, SizeBytes)) {} + + inline IoBuffer(ECloneTag, const void* DataPtr, size_t SizeBytes) : m_Core(new IoBufferCore(SizeBytes)) + { + memcpy(const_cast<void*>(m_Core->DataPointer()), DataPtr, SizeBytes); + } + + inline IoBuffer(EAssumeOwnershipTag, const void* DataPtr, size_t Sz) : m_Core(new IoBufferCore(DataPtr, Sz)) + { + m_Core->SetIsOwned(true); + } + + ZENCORE_API IoBuffer(EFileTag, void* FileHandle, uint64_t ChunkFileOffset, uint64_t ChunkSize); + ZENCORE_API IoBuffer(EBorrowedFileTag, void* FileHandle, uint64_t ChunkFileOffset, uint64_t ChunkSize); + + inline operator bool() const { return !m_Core->IsNull(); } + ZENCORE_API void MakeOwned() { return m_Core->MakeOwned(); } + inline bool IsOwned() const { return m_Core->IsOwned(); } + const void* Data() const { return m_Core->DataPointer(); } + const size_t Size() const { return m_Core->DataBytes(); } + ZENCORE_API bool GetFileReference(IoBufferFileReference& OutRef) const; + +private: + RefPtr<IoBufferCore> m_Core = new IoBufferCore; +}; + +class IoBufferBuilder +{ +public: + ZENCORE_API static IoBuffer MakeFromFile(const wchar_t* FileName, uint64_t Offset = 0, uint64_t Size = ~0ull); + ZENCORE_API static IoBuffer MakeFromFileHandle(void* FileHandle, uint64_t Offset = 0, uint64_t Size = ~0ull); + inline static IoBuffer MakeCloneFromMemory(const void* Ptr, size_t Sz) { return IoBuffer(IoBuffer::Clone, Ptr, Sz); } + +private: +}; + +void iobuffer_forcelink(); + +} // namespace zen diff --git a/zencore/include/zencore/iohash.h b/zencore/include/zencore/iohash.h new file mode 100644 index 000000000..4eac7e328 --- /dev/null +++ b/zencore/include/zencore/iohash.h @@ -0,0 +1,95 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "zencore.h" + +#include <zencore/blake3.h> +#include <zencore/memory.h> + +#include <string_view> + +namespace zen { + +class StringBuilderBase; + +/** + * Hash used for content addressable storage + * + * This is basically a BLAKE3-160 hash (note: this is probably not an officially + * recognized identifier). It is generated by computing a 32-byte BLAKE3 hash and + * picking the first 20 bytes of the resulting hash. + * + */ +struct IoHash +{ + uint8_t Hash[20]; + + static IoHash MakeFrom(const void* data /* 20 bytes */) + { + IoHash Io; + memcpy(Io.Hash, data, sizeof Io); + return Io; + } + + static IoHash HashMemory(const void* data, size_t byteCount); + static IoHash HashMemory(MemoryView Data) { return HashMemory(Data.GetData(), Data.GetSize()); } + static IoHash FromHexString(const char* string); + static IoHash FromHexString(const std::string_view string); + const char* ToHexString(char* outString /* 40 characters + NUL terminator */) const; + StringBuilderBase& ToHexString(StringBuilderBase& outBuilder) const; + std::string ToHexString() const; + + static const int StringLength = 40; + typedef char String_t[StringLength + 1]; + + static IoHash Zero; // Initialized to all zeros + + inline auto operator<=>(const IoHash& rhs) const = default; + + struct Hasher + { + size_t operator()(const IoHash& v) const + { + size_t h; + memcpy(&h, v.Hash, sizeof h); + return h; + } + }; +}; + +struct IoHashStream +{ + /// Begin streaming hash compute (not needed on freshly constructed instance) + void Reset() { m_Blake3Stream.Reset(); } + + /// Append another chunk + IoHashStream& Append(const void* data, size_t byteCount) + { + m_Blake3Stream.Append(data, byteCount); + return *this; + } + + /// Append another chunk + IoHashStream& Append(MemoryView Data) + { + m_Blake3Stream.Append(Data.GetData(), Data.GetSize()); + return *this; + } + + /// Obtain final hash. 
If you wish to reuse the instance call reset() + IoHash GetHash() + { + BLAKE3 b3 = m_Blake3Stream.GetHash(); + + IoHash Io; + memcpy(Io.Hash, b3.Hash, sizeof Io.Hash); + + return Io; + } + +private: + BLAKE3Stream m_Blake3Stream; +}; + +} // namespace zen diff --git a/zencore/include/zencore/md5.h b/zencore/include/zencore/md5.h new file mode 100644 index 000000000..4ed4f6c56 --- /dev/null +++ b/zencore/include/zencore/md5.h @@ -0,0 +1,50 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <stdint.h> +#include <compare> +#include "zencore.h" + +namespace zen { + +class StringBuilderBase; + +struct MD5 +{ + uint8_t Hash[16]; + + inline auto operator<=>(const MD5& rhs) const = default; + + static const int StringLength = 40; + typedef char String_t[StringLength + 1]; + + static MD5 HashMemory(const void* data, size_t byteCount); + static MD5 FromHexString(const char* string); + const char* ToHexString(char* outString /* 40 characters + NUL terminator */) const; + StringBuilderBase& ToHexString(StringBuilderBase& outBuilder) const; + + static MD5 Zero; // Initialized to all zeroes +}; + +/** + * Utility class for computing SHA1 hashes + */ +class MD5Stream +{ +public: + MD5Stream(); + + /// Begin streaming MD5 compute (not needed on freshly constructed MD5Stream instance) + void Reset(); + /// Append another chunk + MD5Stream& Append(const void* data, size_t byteCount); + /// Obtain final MD5 hash. If you wish to reuse the MD5Stream instance call reset() + MD5 GetHash(); + +private: +}; + +void md5_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/memory.h b/zencore/include/zencore/memory.h new file mode 100644 index 000000000..8a16126ef --- /dev/null +++ b/zencore/include/zencore/memory.h @@ -0,0 +1,213 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "zencore.h" + +#include <zencore/thread.h> + +#include <algorithm> +#include <vector> + +namespace zen { + +class MemoryArena +{ +public: + ZENCORE_API MemoryArena(); + ZENCORE_API ~MemoryArena(); + + ZENCORE_API void* Alloc(size_t size, size_t alignment); + ZENCORE_API void Free(void* ptr); + +private: +}; + +class Memory +{ +public: + ZENCORE_API static void* Alloc(size_t size, size_t alignment = sizeof(void*)); + ZENCORE_API static void Free(void* ptr); +}; + +/** Allocator which claims fixed-size blocks from the underlying allocator. + + There is no way to free individual memory blocks. + + \note This is not thread-safe, you will need to provide synchronization yourself +*/ + +class ChunkingLinearAllocator +{ +public: + ChunkingLinearAllocator(uint64_t ChunkSize, uint64_t ChunkAlignment = sizeof(std::max_align_t)); + ~ChunkingLinearAllocator(); + + ZENCORE_API void Reset(); + + ZENCORE_API void* Alloc(size_t Size, size_t Alignment = sizeof(void*)); + inline void Free(void* Ptr) { ZEN_UNUSED(Ptr); /* no-op */ } + +private: + uint8_t* m_ChunkCursor = nullptr; + uint64_t m_ChunkBytesRemain = 0; + const uint64_t m_ChunkSize = 0; + const uint64_t m_ChunkAlignment = 0; + std::vector<void*> m_ChunkList; +}; + +////////////////////////////////////////////////////////////////////////// + +struct MutableMemoryView +{ + MutableMemoryView() = default; + + MutableMemoryView(void* DataPtr, size_t DataSize) + : m_Data(reinterpret_cast<uint8_t*>(DataPtr)) + , m_DataEnd(reinterpret_cast<uint8_t*>(DataPtr) + DataSize) + { + } + + inline bool IsEmpty() const { return m_Data == m_DataEnd; } + void* GetData() const { return m_Data; } + void* GetDataEnd() const { return m_DataEnd; } + size_t GetSize() const { return reinterpret_cast<uint8_t*>(m_DataEnd) - reinterpret_cast<uint8_t*>(m_Data); } + + inline bool EqualBytes(const MutableMemoryView& InView) const + { + const size_t Size = GetSize(); + + return Size == InView.GetSize() && (memcmp(m_Data, 
InView.m_Data, Size) == 0); + } + + /** Modifies the view by chopping the given number of bytes from the left. */ + inline void RightChopInline(uint64_t InSize) + { + const uint64_t Offset = std::min(GetSize(), InSize); + m_Data = GetDataAtOffsetNoCheck(Offset); + } + + /** Returns the left-most part of the view by taking the given number of bytes from the left. */ + constexpr inline MutableMemoryView Left(uint64_t InSize) const + { + MutableMemoryView View(*this); + View.LeftInline(InSize); + return View; + } + + /** Modifies the view to be the given number of bytes from the left. */ + constexpr inline void LeftInline(uint64_t InSize) { m_DataEnd = std::min(m_DataEnd, m_Data + InSize); } + +private: + uint8_t* m_Data = nullptr; + uint8_t* m_DataEnd = nullptr; + + /** Returns the data pointer advanced by an offset in bytes. */ + inline uint8_t* GetDataAtOffsetNoCheck(uint64_t InOffset) const { return m_Data + InOffset; } +}; + +struct MemoryView +{ + MemoryView() = default; + + MemoryView(const MutableMemoryView& MutableView) + : m_Data(reinterpret_cast<const uint8_t*>(MutableView.GetData())) + , m_DataEnd(m_Data + MutableView.GetSize()) + { + } + + MemoryView(const void* DataPtr, size_t DataSize) + : m_Data(reinterpret_cast<const uint8_t*>(DataPtr)) + , m_DataEnd(reinterpret_cast<const uint8_t*>(DataPtr) + DataSize) + { + } + + MemoryView(const void* DataPtr, const void* DataEndPtr) + : m_Data(reinterpret_cast<const uint8_t*>(DataPtr)) + , m_DataEnd(reinterpret_cast<const uint8_t*>(DataEndPtr)) + { + } + + inline bool Contains(const MemoryView& Other) const { return (m_Data <= Other.m_Data) && (m_DataEnd >= Other.m_DataEnd); } + inline bool IsEmpty() const { return m_Data == m_DataEnd; } + const void* GetData() const { return m_Data; } + const void* GetDataEnd() const { return m_DataEnd; } + size_t GetSize() const { return reinterpret_cast<const uint8_t*>(m_DataEnd) - reinterpret_cast<const uint8_t*>(m_Data); } + inline bool operator==(const MemoryView& Rhs) const 
{ return m_Data == Rhs.m_Data && m_DataEnd == Rhs.m_DataEnd; } + + inline bool EqualBytes(const MemoryView& InView) const + { + const size_t Size = GetSize(); + + return Size == InView.GetSize() && (memcmp(m_Data, InView.GetData(), Size) == 0); + } + + inline MemoryView& operator+=(size_t InSize) + { + RightChopInline(InSize); + return *this; + } + + /** Modifies the view by chopping the given number of bytes from the left. */ + inline void RightChopInline(uint64_t InSize) + { + const uint64_t Offset = std::min(GetSize(), InSize); + m_Data = GetDataAtOffsetNoCheck(Offset); + } + + inline MemoryView RightChop(uint64_t InSize) + { + MemoryView View(*this); + View.RightChopInline(InSize); + return View; + } + + /** Returns the left-most part of the view by taking the given number of bytes from the left. */ + constexpr inline MemoryView Left(uint64_t InSize) const + { + MemoryView View(*this); + View.LeftInline(InSize); + return View; + } + + /** Modifies the view to be the given number of bytes from the left. */ + constexpr inline void LeftInline(uint64_t InSize) { m_DataEnd = std::min(m_DataEnd, m_Data + InSize); } + + constexpr void Reset() + { + m_Data = nullptr; + m_DataEnd = nullptr; + } + +private: + const uint8_t* m_Data = nullptr; + const uint8_t* m_DataEnd = nullptr; + + /** Returns the data pointer advanced by an offset in bytes. */ + inline const uint8_t* GetDataAtOffsetNoCheck(uint64_t InOffset) const { return m_Data + InOffset; } +}; + +/** + * Make a non-owning view of the memory of the initializer list. + * + * This overload is only available when the element type does not need to be deduced. + */ +template<typename T> +[[nodiscard]] inline MemoryView +MakeMemoryView(std::initializer_list<typename std::type_identity<T>::type> List) +{ + return MemoryView(List.begin(), List.size() * sizeof(T)); +} + +/** Make a non-owning view of the memory of the contiguous container. 
*/ +template<std::ranges::contiguous_range R> +[[nodiscard]] constexpr inline MemoryView +MakeMemoryView(const R& Container) +{ + const auto& Front = *std::begin(Container); + return MemoryView(std::addressof(Front), std::size(Container) * sizeof(Front)); +} + +void memory_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/meta.h b/zencore/include/zencore/meta.h new file mode 100644 index 000000000..82eb5cc30 --- /dev/null +++ b/zencore/include/zencore/meta.h @@ -0,0 +1,30 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +/* This file contains utility functions for meta programming + * + * Since you're in here you're probably quite observant, and you'll + * note that it's quite barren here. This is because template + * metaprogramming is awful and I try not to engage in it. However, + * sometimes these things are forced upon us. + * + */ + +namespace zen { + +/** + * Uses implicit conversion to create an instance of a specific type. + * Useful to make things clearer or circumvent unintended type deduction in templates. + * Safer than C casts and static_casts, e.g. does not allow down-casts + * + * @param Obj The object (usually pointer or reference) to convert. + * + * @return The object converted to the specified type. + */ +template<typename T> +inline T +ImplicitConv(typename std::type_identity<T>::type Obj) +{ + return Obj; +} + +} // namespace zen diff --git a/zencore/include/zencore/refcount.h b/zencore/include/zencore/refcount.h new file mode 100644 index 000000000..de7d315f4 --- /dev/null +++ b/zencore/include/zencore/refcount.h @@ -0,0 +1,144 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+#pragma once + +#include "atomic.h" +#include "zencore.h" + +namespace zen { + +/** + * Helper base class for reference counted objects using intrusive reference counts + * + * This class is pretty straightforward but does one thing which may be unexpected: + * + * - Instances on the stack are initialized with a reference count of one to ensure + * nobody tries to accidentally delete it. (TODO: is this really useful?) + */ +class RefCounted +{ +public: + RefCounted() : m_RefCount(IsPointerToStack(this)){}; + virtual ~RefCounted() = default; + + inline uint32_t AddRef() const { return AtomicIncrement(const_cast<RefCounted*>(this)->m_RefCount); } + inline uint32_t Release() const + { + uint32_t refCount = AtomicDecrement(const_cast<RefCounted*>(this)->m_RefCount); + if (refCount == 0) + { + delete this; + } + return refCount; + } + + // Copying reference counted objects doesn't make a lot of sense generally, so let's prevent it + + RefCounted(const RefCounted&) = delete; + RefCounted(RefCounted&&) = delete; + RefCounted& operator=(const RefCounted&) = delete; + RefCounted& operator=(RefCounted&&) = delete; + +protected: + inline uint32_t RefCount() const { return m_RefCount; } + +private: + uint32_t m_RefCount = 0; +}; + +/** + * Smart pointer for classes derived from RefCounted + */ + +template<class T> +class RefPtr +{ +public: + inline RefPtr() = default; + inline RefPtr(const RefPtr& Rhs) : m_Ref(Rhs.m_Ref) { m_Ref && m_Ref->AddRef(); } + inline RefPtr(T* Ptr) : m_Ref(Ptr) { m_Ref && m_Ref->AddRef(); } + inline ~RefPtr() { m_Ref && m_Ref->Release(); } + + inline explicit operator bool() const { return m_Ref != nullptr; } + inline operator T*() const { return m_Ref; } + inline T* operator->() const { return m_Ref; } + + inline std::strong_ordering operator<=>(const RefPtr& Rhs) const = default; + + inline RefPtr& operator=(T* Rhs) + { + Rhs && Rhs->AddRef(); + m_Ref && m_Ref->Release(); + m_Ref = Rhs; + return *this; + } + inline RefPtr& operator=(const RefPtr& 
Rhs) + { + // Retain the incoming object before releasing the current one, so that self-assignment + // cannot destroy the object while it is still needed + T* Incoming = Rhs.m_Ref; + Incoming && Incoming->AddRef(); + m_Ref && m_Ref->Release(); + m_Ref = Incoming; + return *this; + } + inline RefPtr& operator=(RefPtr&& Rhs) noexcept + { + m_Ref && m_Ref->Release(); + m_Ref = Rhs.m_Ref; + Rhs.m_Ref = nullptr; + return *this; + } + inline RefPtr(RefPtr&& Rhs) noexcept : m_Ref(Rhs.m_Ref) { Rhs.m_Ref = nullptr; } + +private: + T* m_Ref = nullptr; +}; + +/** + * Smart pointer for classes derived from RefCounted + * + * This variant does not decay to a raw pointer + * + */ + +template<class T> +class Ref +{ +public: + inline Ref() = default; + inline Ref(const Ref& Rhs) : m_Ref(Rhs.m_Ref) { m_Ref && m_Ref->AddRef(); } + inline Ref(T* Ptr) : m_Ref(Ptr) { m_Ref && m_Ref->AddRef(); } + inline ~Ref() { m_Ref && m_Ref->Release(); } + + inline explicit operator bool() const { return m_Ref != nullptr; } + inline T* operator->() const { return m_Ref; } + + inline std::strong_ordering operator<=>(const Ref& Rhs) const = default; + + inline Ref& operator=(T* Rhs) + { + Rhs && Rhs->AddRef(); + m_Ref && m_Ref->Release(); + m_Ref = Rhs; + return *this; + } + inline Ref& operator=(const Ref& Rhs) + { + // Retain the incoming object before releasing the current one, so that self-assignment + // cannot destroy the object while it is still needed + T* Incoming = Rhs.m_Ref; + Incoming && Incoming->AddRef(); + m_Ref && m_Ref->Release(); + m_Ref = Incoming; + return *this; + } + inline Ref& operator=(Ref&& Rhs) noexcept + { + m_Ref && m_Ref->Release(); + m_Ref = Rhs.m_Ref; + Rhs.m_Ref = nullptr; + return *this; + } + inline Ref(Ref&& Rhs) noexcept : m_Ref(Rhs.m_Ref) { Rhs.m_Ref = nullptr; } + +private: + T* m_Ref = nullptr; +}; + +void refcount_forcelink(); + +} // namespace zen diff --git a/zencore/include/zencore/scopeguard.h b/zencore/include/zencore/scopeguard.h new file mode 100644 index 000000000..ba8cd3094 --- /dev/null +++ b/zencore/include/zencore/scopeguard.h @@ -0,0 +1,33 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <type_traits> +#include <utility> +#include "zencore.h" + +namespace zen { + +/** + * Invokes a callable when the guard is destroyed, unless Dismiss() was called first. + * + * Usually created through MakeGuard(), which deduces the callable type. + */ +template<typename T> +class ScopeGuardImpl +{ +public: + // Forward rather than copy, so a callable passed as an rvalue is moved into the guard + inline ScopeGuardImpl(T&& func) : m_guardFunc(std::forward<T>(func)) {} + + // Moving hands the pending call over to the new guard; the source guard is dismissed + inline ScopeGuardImpl(ScopeGuardImpl&& Rhs) : m_dismissed(Rhs.m_dismissed), m_guardFunc(std::forward<T>(Rhs.m_guardFunc)) + { + Rhs.m_dismissed = true; + } + + // A copy would invoke the callable a second time, so copying and assignment are disabled + ScopeGuardImpl(const ScopeGuardImpl&) = delete; + ScopeGuardImpl& operator=(const ScopeGuardImpl&) = delete; + ScopeGuardImpl& operator=(ScopeGuardImpl&&) = delete; + + ~ScopeGuardImpl() + { + if (!m_dismissed) + m_guardFunc(); + } + + /** Cancels the pending call; the destructor will then do nothing. */ + void Dismiss() { m_dismissed = true; } + +private: + bool m_dismissed = false; + T m_guardFunc; +}; + +/** + * Creates a guard that calls fn at scope exit. + * + * The callable is stored by value: rvalues are moved in, lvalues are copied and left untouched. + */ +template<typename T> +ScopeGuardImpl<std::decay_t<T>> +MakeGuard(T&& fn) +{ + using GuardFunc = std::decay_t<T>; + return ScopeGuardImpl<GuardFunc>(GuardFunc(std::forward<T>(fn))); +} + +} // namespace zen diff --git a/zencore/include/zencore/sha1.h b/zencore/include/zencore/sha1.h new file mode 100644 index 000000000..fc26f442b --- /dev/null +++ b/zencore/include/zencore/sha1.h @@ -0,0 +1,76 @@ +// ////////////////////////////////////////////////////////// +// sha1.h +// Copyright (c) 2014,2015 Stephan Brumme. All rights reserved. +// see http://create.stephan-brumme.com/disclaimer.html +// + +#pragma once + +#include <stdint.h> +#include <compare> +#include "zencore.h" + +namespace zen { + +class StringBuilderBase; + +struct SHA1 +{ + uint8_t Hash[20]; + + inline auto operator<=>(const SHA1& rhs) const = default; + + static const int StringLength = 40; + typedef char String_t[StringLength + 1]; + + static SHA1 HashMemory(const void* data, size_t byteCount); + static SHA1 FromHexString(const char* string); + const char* ToHexString(char* outString /* 40 characters + NUL terminator */) const; + StringBuilderBase& ToHexString(StringBuilderBase& outBuilder) const; + + static SHA1 Zero; // Initialized to all zeroes +}; + +/** + * Utility class for computing SHA1 hashes + */ +class SHA1Stream +{ +public: + SHA1Stream(); + + /** compute SHA1 of a memory block + + \note SHA1 class contains a slightly more convenient helper function for this use case + \see SHA1::HashMemory() + */ + SHA1 Compute(const void* data, size_t byteCount); + + /// Begin streaming SHA1 compute (not needed on freshly constructed SHA1Stream instance) + void Reset(); + /// Append another chunk + SHA1Stream& Append(const void* 
data, size_t byteCount); + /// Obtain final SHA1 hash. If you wish to reuse the SHA1Stream instance call reset() + SHA1 GetHash(); + +private: + void ProcessBlock(const void* data); + void ProcessBuffer(); + + enum + { + /// split into 64 byte blocks (=> 512 bits) + BlockSize = 512 / 8, + HashBytes = 20, + HashValues = HashBytes / 4 + }; + + uint64_t m_NumBytes; // size of processed data in bytes + size_t m_BufferSize; // valid bytes in m_buffer + uint8_t m_Buffer[BlockSize]; // bytes not processed yet + uint32_t m_Hash[HashValues]; +}; + +void sha1_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/sharedbuffer.h b/zencore/include/zencore/sharedbuffer.h new file mode 100644 index 000000000..c6206f780 --- /dev/null +++ b/zencore/include/zencore/sharedbuffer.h @@ -0,0 +1,169 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "zencore.h" + +#include <zencore/memory.h> +#include <zencore/refcount.h> + +#include <memory.h> + +namespace zen { + +class BufferOwner : public RefCounted +{ +protected: + inline BufferOwner(void* DataPtr, uint64_t DataSize, bool IsOwned, BufferOwner* OuterBuffer = nullptr) + : m_IsOwned(IsOwned) + , m_Data(DataPtr) + , m_Size(DataSize) + , m_Outer(OuterBuffer) + { + } + + virtual ~BufferOwner(); + + // Ownership is a transitive property, and m_IsOwned currently only flags that this instance is responsible + // for managing the allocated memory, so we need to make recursive calls. 
Could be optimized slightly by + // adding a dedicated flag + inline bool IsOwned() const + { + if (m_IsOwned) + { + return true; + } + else + { + return m_Outer && m_Outer->IsOwned(); + } + } + + BufferOwner(const BufferOwner&) = delete; + BufferOwner& operator=(const BufferOwner&) = delete; + +private: + bool m_IsOwned; + void* m_Data; + uint64_t m_Size; + RefPtr<BufferOwner> m_Outer; + + friend class UniqueBuffer; + friend class SharedBuffer; +}; + +/** + * Reference to a memory buffer with a single owner (see std::unique_ptr) + */ +class UniqueBuffer +{ +public: + UniqueBuffer(const UniqueBuffer&) = delete; + UniqueBuffer& operator=(const UniqueBuffer&) = delete; + + UniqueBuffer() = default; + ZENCORE_API explicit UniqueBuffer(BufferOwner* Owner); + + void* GetData() { return m_buffer->m_Data; } + const void* GetData() const { return m_buffer->m_Data; } + size_t GetSize() const { return m_buffer->m_Size; } + operator MutableMemoryView() { return GetView(); } + operator MemoryView() const { return MemoryView(m_buffer->m_Data, m_buffer->m_Size); } + + MutableMemoryView GetView() { return MutableMemoryView(m_buffer->m_Data, m_buffer->m_Size); } + + /** Make an uninitialized owned buffer of the specified size. */ + ZENCORE_API static UniqueBuffer Alloc(uint64_t Size); + + /** Make a non-owned view of the input. 
*/ + ZENCORE_API static UniqueBuffer MakeView(void* DataPtr, uint64_t Size); + +private: + RefPtr<BufferOwner> m_buffer; + + friend class SharedBuffer; +}; + +/** + * Reference to a memory buffer with shared ownership + */ +class SharedBuffer +{ +public: + SharedBuffer() = default; + ZENCORE_API explicit SharedBuffer(UniqueBuffer&&); + inline explicit SharedBuffer(BufferOwner* Owner) : m_buffer(Owner) {} + + void* GetData() + { + if (m_buffer) + { + return m_buffer->m_Data; + } + return nullptr; + } + + const void* GetData() const + { + if (m_buffer) + { + return m_buffer->m_Data; + } + return nullptr; + } + + size_t GetSize() const + { + if (m_buffer) + { + return m_buffer->m_Size; + } + return 0; + } + + ZENCORE_API void MakeOwned(); + bool IsOwned() const { return m_buffer && m_buffer->IsOwned(); } + inline explicit operator bool() const { return m_buffer; } + inline bool IsNull() const { return !m_buffer; } + inline void Reset() { m_buffer = nullptr; } + + MemoryView GetView() const + { + if (m_buffer) + { + return MemoryView(m_buffer->m_Data, m_buffer->m_Size); + } + else + { + return MemoryView(); + } + } + + operator MemoryView() const { return GetView(); } + + SharedBuffer& operator=(UniqueBuffer&& Rhs) + { + m_buffer = std::move(Rhs.m_buffer); + return *this; + } + + std::strong_ordering operator<=>(const SharedBuffer& Rhs) const = default; + + /** Make a non-owned view of the input */ + inline static SharedBuffer MakeView(MemoryView View) { return MakeView(View.GetData(), View.GetSize()); } + /** Make a non-owned view of the input */ + ZENCORE_API static SharedBuffer MakeView(const void* Data, uint64_t Size); + /** Make a non-owned view of the input */ + ZENCORE_API static SharedBuffer MakeView(MemoryView View, SharedBuffer Buffer); + /** Make an owned clone of the buffer */ + ZENCORE_API SharedBuffer Clone(); + /** Make an owned clone of the memory in the input view */ + ZENCORE_API static SharedBuffer Clone(MemoryView View); + +private: + 
RefPtr<BufferOwner> m_buffer; +}; + +void sharedbuffer_forcelink(); + +} // namespace zen diff --git a/zencore/include/zencore/snapshot_manifest.h b/zencore/include/zencore/snapshot_manifest.h new file mode 100644 index 000000000..95e64773a --- /dev/null +++ b/zencore/include/zencore/snapshot_manifest.h @@ -0,0 +1,57 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/iohash.h> +#include <zencore/zencore.h> + +#include <filesystem> +#include <functional> +#include <string> +#include <vector> + +namespace zen { + +struct LeafNode +{ + uint64_t FileSize = 0; + uint64_t FileModifiedTime = 0; + zen::IoHash ChunkHash = zen::IoHash::Zero; + std::wstring Name; +}; + +struct TreeNode +{ + std::vector<TreeNode> Children; + std::vector<LeafNode> Leaves; + std::wstring Name; + zen::BLAKE3 ChunkHash = zen::BLAKE3::Zero; + + ZENCORE_API void VisitModifyFiles(std::function<void(LeafNode& node)> func); + ZENCORE_API void VisitFiles(std::function<void(const LeafNode& node)> func); + ZENCORE_API void Finalize(); +}; + +struct SnapshotManifest +{ + std::string Id; + TreeNode Root; + zen::BLAKE3 ChunkHash = zen::BLAKE3::Zero; + + ZENCORE_API void finalize(); +}; + +class InStream; +class OutStream; + +ZENCORE_API void ReadManifest(SnapshotManifest& Manifest, InStream& FromStream); +ZENCORE_API void WriteManifest(const SnapshotManifest& Manifest, OutStream& ToStream); +ZENCORE_API void PrintManifest(const SnapshotManifest& Manifest, OutStream& ToStream); + +// Translate a user-provided manifest specification into a file path. 
+// Supports hashtag syntax to implicitly refer to user documents zenfs folder +ZENCORE_API std::filesystem::path ManifestSpecToPath(const char* ManifestSpec); + +void snapshotmanifest_forcelink(); + +} // namespace zen diff --git a/zencore/include/zencore/stats.h b/zencore/include/zencore/stats.h new file mode 100644 index 000000000..7290fd914 --- /dev/null +++ b/zencore/include/zencore/stats.h @@ -0,0 +1,66 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <atomic> +#include <type_traits> +#include "zencore.h" + +namespace zen { + +template<typename T> +class Gauge +{ +public: + Gauge() : m_value{0} {} + +private: + T m_value; +}; + +class Counter +{ +public: + inline void SetValue(uint64_t Value) { m_count = Value; } + inline uint64_t Value() const { return m_count; } + + inline void Increment(int64_t AddValue) { m_count += AddValue; } + inline void Decrement(int64_t SubValue) { m_count -= SubValue; } + inline void Clear() { m_count = 0; } + +private: + std::atomic_uint64_t m_count{0}; +}; + +/// <summary> +/// Exponential Weighted Moving Average +/// </summary> +class EWMA +{ +public: + /// <summary> + /// Update EWMA with new measure + /// </summary> + /// <param name="Alpha">Smoothing factor (between 0 and 1)</param> + /// <param name="Interval">Elapsed time since last</param> + /// <param name="Count">Value</param> + /// <param name="IsInitialUpdate">Whether this is the first update or not</param> + void Tick(double Alpha, uint64_t Interval, uint64_t Count, bool IsInitialUpdate); + double Rate() const; + +private: + double m_rate = 0; +}; + +/// <summary> +/// Tracks rate of events over time (i.e requests/sec) +/// </summary> +class Meter +{ +public: +private: +}; + +extern void stats_forcelink(); + +} // namespace zen diff --git a/zencore/include/zencore/stream.h b/zencore/include/zencore/stream.h new file mode 100644 index 000000000..4e8c58382 --- /dev/null +++ b/zencore/include/zencore/stream.h @@ -0,0 +1,318 @@ +// 
Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "zencore.h" + +#include <zencore/memory.h> +#include <zencore/refcount.h> +#include <zencore/thread.h> + +#include <string_view> +#include <vector> + +namespace zen { + +/** + * Basic byte stream interface + * + * This is intended as a minimal base class offering only the absolute minimum of functionality. + * + * IMPORTANT: To better support concurrency, this abstraction offers no "file pointer". Thus + * every read or write operation needs to specify the offset from which they wish to read. + * + * Most client code will likely want to use reader/writer classes like BinaryWriter/BinaryReader + * + */ +class OutStream : public RefCounted +{ +public: + virtual void Write(const void* Data, size_t ByteCount, uint64_t Offset) = 0; + virtual void Flush() = 0; +}; + +class InStream : public RefCounted +{ +public: + virtual void Read(void* DataPtr, size_t ByteCount, uint64_t Offset) = 0; + virtual uint64_t Size() const = 0; +}; + +/** + * Stream which writes into a growing memory buffer + */ +class MemoryOutStream : public OutStream +{ +public: + MemoryOutStream() = default; + ~MemoryOutStream() = default; + + virtual void Write(const void* DataPtr, size_t ByteCount, uint64_t Offset) override; + virtual void Flush() override; + inline const uint8_t* Data() const { return m_Buffer.data(); } + inline uint64_t Size() const { return m_Buffer.size(); } + +private: + RwLock m_Lock; + std::vector<uint8_t> m_Buffer; +}; + +inline MemoryView +MakeMemoryView(const MemoryOutStream& Stream) +{ + return MemoryView(Stream.Data(), Stream.Size()); +} + +/** + * Stream which reads from a memory buffer + */ +class MemoryInStream : public InStream +{ +public: + MemoryInStream(const void* Buffer, size_t Size); + MemoryInStream(MemoryView View) : MemoryInStream(View.GetData(), View.GetSize()) {} + ~MemoryInStream() = default; + + virtual void Read(void* DataPtr, size_t ByteCount, uint64_t ReadOffset) override; + 
virtual uint64_t Size() const override { return m_Buffer.size(); } + inline const uint8_t* Data() const { return m_Buffer.data(); } + +private: + RwLock m_Lock; + std::vector<uint8_t> m_Buffer; +}; + +/** + * Binary stream writer + */ + +class BinaryWriter +{ +public: + inline BinaryWriter(OutStream& Stream) : m_Stream(&Stream) {} + ~BinaryWriter() = default; + + inline void Write(const void* DataPtr, size_t ByteCount) + { + m_Stream->Write(DataPtr, ByteCount, m_Offset); + m_Offset += ByteCount; + } + + uint64_t CurrentOffset() const { return m_Offset; } + +private: + RefPtr<OutStream> m_Stream; + uint64_t m_Offset = 0; +}; + +inline BinaryWriter& +operator<<(BinaryWriter& Writer, bool Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} +inline BinaryWriter& +operator<<(BinaryWriter& Writer, int8_t Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} +inline BinaryWriter& +operator<<(BinaryWriter& Writer, int16_t Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} +inline BinaryWriter& +operator<<(BinaryWriter& Writer, int32_t Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} +inline BinaryWriter& +operator<<(BinaryWriter& Writer, int64_t Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} +inline BinaryWriter& +operator<<(BinaryWriter& Writer, uint8_t Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} +inline BinaryWriter& +operator<<(BinaryWriter& Writer, uint16_t Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} +inline BinaryWriter& +operator<<(BinaryWriter& Writer, uint32_t Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} +inline BinaryWriter& +operator<<(BinaryWriter& Writer, uint64_t Value) +{ + Writer.Write(&Value, sizeof Value); + return Writer; +} + +/** + * Binary stream reader + */ + +class BinaryReader +{ +public: + inline BinaryReader(InStream& Stream) : m_Stream(&Stream) {} + ~BinaryReader() = default; + + inline void 
Read(void* DataPtr, size_t ByteCount) + { + m_Stream->Read(DataPtr, ByteCount, m_Offset); + m_Offset += ByteCount; + } + + void Seek(uint64_t Offset) + { + ZEN_ASSERT(Offset <= m_Stream->Size()); + m_Offset = Offset; + } + + void Skip(uint64_t SkipOffset) + { + ZEN_ASSERT((m_Offset + SkipOffset) <= m_Stream->Size()); + m_Offset += SkipOffset; + } + + inline uint64_t CurrentOffset() const { return m_Offset; } + inline uint64_t AvailableBytes() const { return m_Stream->Size() - m_Offset; } + +private: + RefPtr<InStream> m_Stream; + uint64_t m_Offset = 0; +}; + +inline BinaryReader& +operator>>(BinaryReader& Reader, bool& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} +inline BinaryReader& +operator>>(BinaryReader& Reader, int8_t& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} +inline BinaryReader& +operator>>(BinaryReader& Reader, int16_t& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} +inline BinaryReader& +operator>>(BinaryReader& Reader, int32_t& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} +inline BinaryReader& +operator>>(BinaryReader& Reader, int64_t& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} +inline BinaryReader& +operator>>(BinaryReader& Reader, uint8_t& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} +inline BinaryReader& +operator>>(BinaryReader& Reader, uint16_t& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} +inline BinaryReader& +operator>>(BinaryReader& Reader, uint32_t& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} +inline BinaryReader& +operator>>(BinaryReader& Reader, uint64_t& Value) +{ + Reader.Read(&Value, sizeof Value); + return Reader; +} + +/** + * Text stream writer + */ + +class TextWriter +{ +public: + ZENCORE_API TextWriter(OutStream& Stream); + ZENCORE_API ~TextWriter(); + + ZENCORE_API virtual void Write(const void* DataPtr, size_t ByteCount); + ZENCORE_API void Writef(const 
char* FormatString, ...); + + inline uint64_t CurrentOffset() const { return m_CurrentOffset; } + +private: + RefPtr<OutStream> m_Stream; + uint64_t m_CurrentOffset = 0; +}; + +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, const char* Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, const std::string_view& Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, bool Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, int8_t Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, int16_t Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, int32_t Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, int64_t Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, uint8_t Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, uint16_t Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, uint32_t Value); +ZENCORE_API TextWriter& operator<<(TextWriter& Writer, uint64_t Value); + +class IndentTextWriter : public TextWriter +{ +public: + ZENCORE_API IndentTextWriter(OutStream& stream); + ZENCORE_API ~IndentTextWriter(); + + ZENCORE_API virtual void Write(const void* DataPtr, size_t ByteCount) override; + + inline void Indent(int Amount) { m_IndentAmount += Amount; } + + struct Scope + { + Scope(IndentTextWriter& Outer, int IndentAmount = 2) : m_Outer(Outer), m_IndentAmount(IndentAmount) + { + m_Outer.Indent(IndentAmount); + } + + ~Scope() { m_Outer.Indent(-m_IndentAmount); } + + private: + IndentTextWriter& m_Outer; + int m_IndentAmount; + }; + +private: + int m_IndentAmount = 0; + int m_LineCursor = 0; + char m_LineBuffer[2048]; +}; + +void stream_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/streamutil.h b/zencore/include/zencore/streamutil.h new file mode 100644 index 000000000..190cd18eb --- /dev/null +++ b/zencore/include/zencore/streamutil.h @@ -0,0 +1,118 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <fmt/format.h> +#include <zencore/string.h> +#include <string> +#include <string_view> + +#include "blake3.h" +#include "iohash.h" +#include "sha1.h" +#include "stream.h" + +namespace zen { + +ZENCORE_API BinaryWriter& operator<<(BinaryWriter& writer, const std::string_view& value); +ZENCORE_API BinaryReader& operator>>(BinaryReader& reader, std::string& value); + +ZENCORE_API BinaryWriter& operator<<(BinaryWriter& writer, const std::wstring_view& value); +ZENCORE_API BinaryReader& operator>>(BinaryReader& reader, std::wstring& value); +ZENCORE_API TextWriter& operator<<(TextWriter& writer, const std::wstring_view& value); + +inline BinaryWriter& +operator<<(BinaryWriter& writer, const SHA1& value) +{ + writer.Write(value.Hash, sizeof value.Hash); + return writer; +} +inline BinaryReader& +operator>>(BinaryReader& reader, SHA1& value) +{ + reader.Read(value.Hash, sizeof value.Hash); + return reader; +} +ZENCORE_API TextWriter& operator<<(TextWriter& writer, const zen::SHA1& value); + +inline BinaryWriter& +operator<<(BinaryWriter& writer, const BLAKE3& value) +{ + writer.Write(value.Hash, sizeof value.Hash); + return writer; +} +inline BinaryReader& +operator>>(BinaryReader& reader, BLAKE3& value) +{ + reader.Read(value.Hash, sizeof value.Hash); + return reader; +} +ZENCORE_API TextWriter& operator<<(TextWriter& writer, const BLAKE3& value); + +inline BinaryWriter& +operator<<(BinaryWriter& writer, const IoHash& value) +{ + writer.Write(value.Hash, sizeof value.Hash); + return writer; +} +inline BinaryReader& +operator>>(BinaryReader& reader, IoHash& value) +{ + reader.Read(value.Hash, sizeof value.Hash); + return reader; +} +ZENCORE_API TextWriter& operator<<(TextWriter& writer, const IoHash& value); + +} // namespace zen + +////////////////////////////////////////////////////////////////////////// + +template<> +struct fmt::formatter<zen::IoHash> +{ + constexpr auto parse(format_parse_context& ctx) + { + // Parse the presentation 
format and store it in the formatter: + auto it = ctx.begin(), end = ctx.end(); + + // Check if reached the end of the range: + if (it != end && *it != '}') + throw format_error("invalid format"); + + // Return an iterator past the end of the parsed range: + return it; + } + + template<typename FormatContext> + auto format(const zen::IoHash& h, FormatContext& ctx) const + { + zen::ExtendableStringBuilder<48> String; + h.ToHexString(String); + // Pass the text as an argument, never as the format string; qualify the call so ADL on + // std::string_view cannot select std::format_to + return fmt::format_to(ctx.out(), "{}", std::string_view(String)); + } +}; + +template<> +struct fmt::formatter<zen::BLAKE3> +{ + constexpr auto parse(format_parse_context& ctx) + { + // Parse the presentation format and store it in the formatter: + auto it = ctx.begin(), end = ctx.end(); + + // Check if reached the end of the range: + if (it != end && *it != '}') + throw format_error("invalid format"); + + // Return an iterator past the end of the parsed range: + return it; + } + + template<typename FormatContext> + auto format(const zen::BLAKE3& h, FormatContext& ctx) const + { + zen::ExtendableStringBuilder<80> String; + h.ToHexString(String); + // Pass the text as an argument, never as the format string; qualify the call so ADL on + // std::string_view cannot select std::format_to + return fmt::format_to(ctx.out(), "{}", std::string_view(String)); + } +}; diff --git a/zencore/include/zencore/string.h b/zencore/include/zencore/string.h new file mode 100644 index 000000000..d7727ca08 --- /dev/null +++ b/zencore/include/zencore/string.h @@ -0,0 +1,595 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "intmath.h" +#include "zencore.h" + +#include <stdint.h> +#include <string.h> +#include <charconv> +#include <codecvt> +#include <concepts> +#include <optional> +#include <span> +#include <string_view> + +namespace zen { + +////////////////////////////////////////////////////////////////////////// + +inline bool +StringEquals(const char8_t* s1, const char* s2) +{ + return strcmp(reinterpret_cast<const char*>(s1), s2) == 0; +} + +inline bool +StringEquals(const char* s1, const char* s2) +{ + return strcmp(s1, s2) == 0; +} + +inline size_t +StringLength(const char* str) +{ + return strlen(str); +} + +inline bool +StringEquals(const wchar_t* s1, const wchar_t* s2) +{ + return wcscmp(s1, s2) == 0; +} + +inline size_t +StringLength(const wchar_t* str) +{ + return wcslen(str); +} + +////////////////////////////////////////////////////////////////////////// +// File name helpers +// + +ZENCORE_API const char* FilepathFindExtension(const std::string_view& path, const char* extensionToMatch = nullptr); + +////////////////////////////////////////////////////////////////////////// +// Text formatting of numbers +// + +ZENCORE_API bool ToString(std::span<char> Buffer, uint64_t Num); +ZENCORE_API bool ToString(std::span<char> Buffer, int64_t Num); + +struct TextNumBase +{ + inline const char* c_str() const { return m_Buffer; } + inline operator std::string_view() const { return std::string_view(m_Buffer); } + +protected: + char m_Buffer[24]; +}; + +struct IntNum : public TextNumBase +{ + inline IntNum(std::unsigned_integral auto Number) { ToString(m_Buffer, uint64_t(Number)); } + inline IntNum(std::signed_integral auto Number) { ToString(m_Buffer, int64_t(Number)); } +}; + +////////////////////////////////////////////////////////////////////////// +// +// Quick-and-dirty string builder. 
Good enough for me, but contains traps +// and not-quite-ideal behaviour especially when mixing character types etc +// + +template<typename C> +class StringBuilderImpl +{ +public: + StringBuilderImpl() = default; + ZENCORE_API ~StringBuilderImpl(); + + StringBuilderImpl(const StringBuilderImpl&) = delete; + StringBuilderImpl(const StringBuilderImpl&&) = delete; + const StringBuilderImpl& operator=(const StringBuilderImpl&) = delete; + const StringBuilderImpl& operator=(const StringBuilderImpl&&) = delete; + + StringBuilderImpl& Append(C OneChar) + { + EnsureCapacity(1); + + *m_CurPos++ = OneChar; + + return *this; + } + + inline StringBuilderImpl& AppendAscii(const std::string_view& String) + { + const size_t len = String.size(); + + EnsureCapacity(len); + + for (size_t i = 0; i < len; ++i) + m_CurPos[i] = String[i]; + + m_CurPos += len; + + return *this; + } + + inline StringBuilderImpl& AppendAscii(const std::u8string_view& String) + { + const size_t len = String.size(); + + EnsureCapacity(len); + + for (size_t i = 0; i < len; ++i) + m_CurPos[i] = String[i]; + + m_CurPos += len; + + return *this; + } + + inline StringBuilderImpl& AppendAscii(const char* NulTerminatedString) + { + size_t StringLen = StringLength(NulTerminatedString); + + return AppendAscii({NulTerminatedString, StringLen}); + } + + inline StringBuilderImpl& Append(const char8_t* NulTerminatedString) + { + // This is super hacky and not fully functional - needs better + // solution + if constexpr (sizeof(C) == 1) + { + size_t len = StringLength((const char*)NulTerminatedString); + + EnsureCapacity(len); + + for (size_t i = 0; i < len; ++i) + m_CurPos[i] = C(NulTerminatedString[i]); + + m_CurPos += len; + } + else + { + ZEN_NOT_IMPLEMENTED(); + } + + return *this; + } + + inline StringBuilderImpl& AppendAsciiRange(const char* BeginString, const char* EndString) + { + EnsureCapacity(EndString - BeginString); + + while (BeginString != EndString) + *m_CurPos++ = *BeginString++; + + return *this; + } + 
+ inline StringBuilderImpl& Append(const C* NulTerminatedString) + { + size_t Len = StringLength(NulTerminatedString); + + EnsureCapacity(Len); + memcpy(m_CurPos, NulTerminatedString, Len * sizeof(C)); + m_CurPos += Len; + + return *this; + } + + inline StringBuilderImpl& Append(const C* NulTerminatedString, size_t MaxChars) + { + size_t len = Min(MaxChars, StringLength(NulTerminatedString)); + + EnsureCapacity(len); + memcpy(m_CurPos, NulTerminatedString, len * sizeof(C)); + m_CurPos += len; + + return *this; + } + + inline StringBuilderImpl& AppendRange(const C* BeginString, const C* EndString) + { + size_t Len = EndString - BeginString; + + EnsureCapacity(Len); + memcpy(m_CurPos, BeginString, Len * sizeof(C)); + m_CurPos += Len; + + return *this; + } + + inline StringBuilderImpl& Append(const std::basic_string_view<C>& String) + { + return AppendRange(String.data(), String.data() + String.size()); + } + + inline const C* c_str() const + { + EnsureNulTerminated(); + return m_Base; + } + + inline C* Data() + { + EnsureNulTerminated(); + return m_Base; + } + + inline const C* Data() const + { + EnsureNulTerminated(); + return m_Base; + } + + inline size_t Size() const { return m_CurPos - m_Base; } + inline bool IsDynamic() const { return m_IsDynamic; } + inline void Reset() { m_CurPos = m_Base; } + + inline StringBuilderImpl& operator<<(uint64_t n) + { + IntNum Str(n); + return AppendAscii(Str); + } + inline StringBuilderImpl& operator<<(int64_t n) + { + IntNum Str(n); + return AppendAscii(Str); + } + inline StringBuilderImpl& operator<<(uint32_t n) + { + IntNum Str(n); + return AppendAscii(Str); + } + inline StringBuilderImpl& operator<<(int32_t n) + { + IntNum Str(n); + return AppendAscii(Str); + } + inline StringBuilderImpl& operator<<(uint16_t n) + { + IntNum Str(n); + return AppendAscii(Str); + } + inline StringBuilderImpl& operator<<(int16_t n) + { + IntNum Str(n); + return AppendAscii(Str); + } + inline StringBuilderImpl& operator<<(uint8_t n) + { + IntNum 
Str(n); + return AppendAscii(Str); + } + inline StringBuilderImpl& operator<<(int8_t n) + { + IntNum Str(n); + return AppendAscii(Str); + } + + inline StringBuilderImpl& operator<<(const char* str) { return AppendAscii(str); } + inline StringBuilderImpl& operator<<(const std::string_view str) { return AppendAscii(str); } + inline StringBuilderImpl& operator<<(const std::u8string_view str) { return AppendAscii(str); } + inline StringBuilderImpl& operator<<(bool v) + { + using namespace std::literals; + if (v) + { + return AppendAscii("true"sv); + } + return AppendAscii("false"sv); + } + +protected: + inline void Init(C* Base, size_t Capacity) + { + m_Base = m_CurPos = Base; + m_End = Base + Capacity; + } + + inline void EnsureNulTerminated() const { *m_CurPos = '\0'; } + + inline void EnsureCapacity(size_t ExtraRequired) + { + // precondition: we know the current buffer has enough capacity + // for the existing string including NUL terminator + + if ((m_CurPos + ExtraRequired) < m_End) + return; + + Extend(ExtraRequired); + } + + ZENCORE_API void Extend(size_t ExtraCapacity); + ZENCORE_API void* AllocBuffer(size_t ByteCount); + ZENCORE_API void FreeBuffer(void* Buffer, size_t ByteCount); + + ZENCORE_API [[noreturn]] void Fail(const char* FailReason); // note: throws exception + + C* m_Base; + C* m_CurPos; + C* m_End; + bool m_IsDynamic = false; + bool m_IsExtendable = false; +}; + +////////////////////////////////////////////////////////////////////////// + +extern template StringBuilderImpl<char>; + +class StringBuilderBase : public StringBuilderImpl<char> +{ +public: + inline StringBuilderBase(char* bufferPointer, size_t bufferCapacity) { Init(bufferPointer, bufferCapacity); } + inline ~StringBuilderBase() = default; + + // Note that we don't need a terminator for the string_view so we avoid calling data() here + inline operator std::string_view() const { return std::string_view(m_Base, m_CurPos - m_Base); } + inline std::string_view ToView() const { return 
std::string_view(m_Base, m_CurPos - m_Base); } + inline std::string ToString() const { return std::string{Data(), Size()}; } + + inline void AppendCodepoint(uint32_t cp) + { + if (cp < 0x80) // one octet + { + Append(static_cast<char8_t>(cp)); + } + else if (cp < 0x800) + { + EnsureCapacity(2); // two octets + m_CurPos[0] = static_cast<char8_t>((cp >> 6) | 0xc0); + m_CurPos[1] = static_cast<char8_t>((cp & 0x3f) | 0x80); + m_CurPos += 2; + } + else if (cp < 0x10000) + { + EnsureCapacity(3); // three octets + m_CurPos[0] = static_cast<char8_t>((cp >> 12) | 0xe0); + m_CurPos[1] = static_cast<char8_t>(((cp >> 6) & 0x3f) | 0x80); + m_CurPos[2] = static_cast<char8_t>((cp & 0x3f) | 0x80); + m_CurPos += 3; + } + else + { + EnsureCapacity(4); // four octets + m_CurPos[0] = static_cast<char8_t>((cp >> 18) | 0xf0); + m_CurPos[1] = static_cast<char8_t>(((cp >> 12) & 0x3f) | 0x80); + m_CurPos[2] = static_cast<char8_t>(((cp >> 6) & 0x3f) | 0x80); + m_CurPos[3] = static_cast<char8_t>((cp & 0x3f) | 0x80); + m_CurPos += 4; + } + } +}; + +template<size_t N> +class StringBuilder : public StringBuilderBase +{ +public: + inline StringBuilder() : StringBuilderBase(m_StringBuffer, sizeof m_StringBuffer) {} + inline ~StringBuilder() = default; + +private: + char m_StringBuffer[N]; +}; + +template<size_t N> +class ExtendableStringBuilder : public StringBuilderBase +{ +public: + inline ExtendableStringBuilder() : StringBuilderBase(m_StringBuffer, sizeof m_StringBuffer) { m_IsExtendable = true; } + inline ~ExtendableStringBuilder() = default; + +private: + char m_StringBuffer[N]; +}; + +////////////////////////////////////////////////////////////////////////// + +extern template StringBuilderImpl<wchar_t>; + +class WideStringBuilderBase : public StringBuilderImpl<wchar_t> +{ +public: + inline WideStringBuilderBase(wchar_t* BufferPointer, size_t BufferCapacity) { Init(BufferPointer, BufferCapacity); } + inline ~WideStringBuilderBase() = default; + + inline operator std::wstring_view() const { 
return std::wstring_view{Data(), Size()}; } + inline std::wstring_view ToView() const { return std::wstring_view{Data(), Size()}; } + inline std::wstring toString() const { return std::wstring{Data(), Size()}; } + + inline StringBuilderImpl& operator<<(const std::u16string_view str) { return Append((const wchar_t*)str.data(), str.size()); } + inline StringBuilderImpl& operator<<(const wchar_t* str) { return Append(str); } + using StringBuilderImpl:: operator<<; +}; + +template<size_t N> +class WideStringBuilder : public WideStringBuilderBase +{ +public: + inline WideStringBuilder() : WideStringBuilderBase(m_Buffer, N) {} + ~WideStringBuilder() = default; + +private: + wchar_t m_Buffer[N]; +}; + +template<size_t N> +class ExtendableWideStringBuilder : public WideStringBuilderBase +{ +public: + inline ExtendableWideStringBuilder() : WideStringBuilderBase(m_Buffer, N) { m_IsExtendable = true; } + ~ExtendableWideStringBuilder() = default; + +private: + wchar_t m_Buffer[N]; +}; + +////////////////////////////////////////////////////////////////////////// + +void Utf8ToWide(const char8_t* str, WideStringBuilderBase& out); +void Utf8ToWide(const std::u8string_view& wstr, WideStringBuilderBase& out); +void Utf8ToWide(const std::string_view& wstr, WideStringBuilderBase& out); +std::wstring Utf8ToWide(const std::string_view& wstr); + +void WideToUtf8(const wchar_t* wstr, StringBuilderBase& out); +std::string WideToUtf8(const wchar_t* wstr); +void WideToUtf8(const std::u16string_view& wstr, StringBuilderBase& out); +void WideToUtf8(const std::wstring_view& wstr, StringBuilderBase& out); +std::string WideToUtf8(const std::wstring_view Wstr); + +/// <summary> +/// Parse hex string into a byte buffer +/// </summary> +/// <param name="string">Input string</param> +/// <param name="characterCount">Number of characters in string</param> +/// <param name="outPtr">Pointer to output buffer</param> +/// <returns>true if the input consisted of all valid hexadecimal characters</returns> 
+ +inline bool +ParseHexBytes(const char* InputString, size_t CharacterCount, uint8_t* OutPtr) +{ + ZEN_ASSERT((CharacterCount & 1) == 0); + + auto char2nibble = [](char c) { + uint8_t c8 = uint8_t(c - '0'); + + if (c8 < 10) + return c8; + + c8 -= 'A' - '0' - 10; + + if (c8 < 16) + return c8; + + c8 -= 'a' - 'A'; + + if (c8 < 16) + return c8; + + return uint8_t(0xff); + }; + + uint8_t allBits = 0; + + while (CharacterCount) + { + uint8_t n0 = char2nibble(InputString[0]); + uint8_t n1 = char2nibble(InputString[1]); + + allBits |= n0 | n1; + + *OutPtr = (n0 << 4) | n1; + + OutPtr += 1; + InputString += 2; + CharacterCount -= 2; + } + + return (allBits & 0x80) == 0; +} + +inline void +ToHexBytes(const uint8_t* InputData, size_t ByteCount, char* OutString) +{ + const char hexchars[] = "0123456789abcdef"; + + while (ByteCount--) + { + uint8_t byte = *InputData++; + + *OutString++ = hexchars[byte >> 4]; + *OutString++ = hexchars[byte & 15]; + } +} + +////////////////////////////////////////////////////////////////////////// +// Format numbers for humans +// + +ZENCORE_API size_t NiceNumToBuffer(uint64_t Num, std::span<char> Buffer); +ZENCORE_API size_t NiceBytesToBuffer(uint64_t Num, std::span<char> Buffer); +ZENCORE_API size_t NiceByteRateToBuffer(uint64_t Num, uint64_t ms, std::span<char> Buffer); +ZENCORE_API size_t NiceLatencyNsToBuffer(uint64_t NanoSeconds, std::span<char> Buffer); +ZENCORE_API size_t NiceTimeSpanMsToBuffer(uint64_t Milliseconds, std::span<char> Buffer); + +struct NiceBase +{ + inline const char* c_str() const { return m_Buffer; } + inline operator std::string_view() const { return std::string_view(m_Buffer); } + +protected: + char m_Buffer[16]; +}; + +struct NiceNum : public NiceBase +{ + inline NiceNum(uint64_t Num) { NiceNumToBuffer(Num, m_Buffer); } +}; + +struct NiceBytes : public NiceBase +{ + inline NiceBytes(uint64_t Num) { NiceBytesToBuffer(Num, m_Buffer); } +}; + +struct NiceByteRate : public NiceBase +{ + inline NiceByteRate(uint64_t 
Bytes, uint64_t TimeMilliseconds) { NiceByteRateToBuffer(Bytes, TimeMilliseconds, m_Buffer); } +}; + +struct NiceLatencyNs : public NiceBase +{ + inline NiceLatencyNs(uint64_t Milliseconds) { NiceLatencyNsToBuffer(Milliseconds, m_Buffer); } +}; + +struct NiceTimeSpanMs : public NiceBase +{ + inline NiceTimeSpanMs(uint64_t Milliseconds) { NiceTimeSpanMsToBuffer(Milliseconds, m_Buffer); } +}; + +////////////////////////////////////////////////////////////////////////// + +inline std::string +NiceRate(uint64_t Num, uint32_t DurationMilliseconds, const char* Unit = "B") +{ + char Buffer[32]; + + if (DurationMilliseconds) + { + NiceNumToBuffer(Num * 1000 / DurationMilliseconds, Buffer); + } + else + { + strcpy_s(Buffer, "0"); + } + + strcat_s(Buffer, Unit); + strcat_s(Buffer, "/s"); + + return Buffer; +} + +////////////////////////////////////////////////////////////////////////// + +template<std::integral T> +std::optional<T> +ParseInt(const std::string_view& Input) +{ + T Out; + const std::from_chars_result Result = std::from_chars(Input.data(), Input.data() + Input.size(), Out); + if (Result.ec == std::errc::invalid_argument || Result.ec == std::errc::result_out_of_range) + { + return std::nullopt; + } + return Out; +} + +////////////////////////////////////////////////////////////////////////// + +void string_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/thread.h b/zencore/include/zencore/thread.h new file mode 100644 index 000000000..48afad33f --- /dev/null +++ b/zencore/include/zencore/thread.h @@ -0,0 +1,118 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "zencore.h" + +namespace zen { + +/** + * Reader-writer lock + * + * - A single thread may hold an exclusive lock at any given moment + * + * - Multiple threads may hold shared locks, but only if no thread has + * acquired an exclusive lock + */ +class RwLock +{ +public: + ZENCORE_API void AcquireShared(); + ZENCORE_API void ReleaseShared(); + + ZENCORE_API void AcquireExclusive(); + ZENCORE_API void ReleaseExclusive(); + + struct SharedLockScope + { + SharedLockScope(RwLock& lock) : m_Lock(lock) { m_Lock.AcquireShared(); } + ~SharedLockScope() { m_Lock.ReleaseShared(); } + + private: + RwLock& m_Lock; + }; + + struct ExclusiveLockScope + { + ExclusiveLockScope(RwLock& lock) : m_Lock(lock) { m_Lock.AcquireExclusive(); } + ~ExclusiveLockScope() { m_Lock.ReleaseExclusive(); } + + private: + RwLock& m_Lock; + }; + +private: + void* m_Srw = nullptr; +}; + +/** Basic abstraction of a simple event synchronization mechanism (aka 'binary semaphore') + */ +class Event +{ +public: + ZENCORE_API Event(); + ZENCORE_API ~Event(); + + Event(Event&& Rhs) : m_EventHandle(Rhs.m_EventHandle) { Rhs.m_EventHandle = nullptr; } + + Event(const Event& Rhs) = delete; + Event& operator=(const Event& Rhs) = delete; + + inline Event& operator=(Event&& Rhs) + { + m_EventHandle = Rhs.m_EventHandle; + Rhs.m_EventHandle = nullptr; + return *this; + } + + ZENCORE_API void Set(); + ZENCORE_API void Reset(); + ZENCORE_API bool Wait(int TimeoutMs = -1); + +protected: + explicit Event(void* EventHandle) : m_EventHandle(EventHandle) {} + + void* m_EventHandle = nullptr; +}; + +/** Basic abstraction of an IPC mechanism (aka 'binary semaphore') + */ +class NamedEvent : public Event +{ +public: + ZENCORE_API explicit NamedEvent(std::string_view EventName); + ZENCORE_API explicit NamedEvent(std::u8string_view EventName); +}; + +/** Basic process abstraction + */ +class Process +{ +public: + ZENCORE_API Process(); + + Process(const Process&) = delete; + Process& operator=(const 
Process&) = delete; + + ZENCORE_API ~Process(); + + ZENCORE_API void Initialize(int Pid); + ZENCORE_API void Initialize(void* ProcessHandle); /// Initialize with an existing handle - takes ownership of the handle + ZENCORE_API bool IsRunning() const; + ZENCORE_API bool IsValid() const; + ZENCORE_API bool Wait(int TimeoutMs = -1); + ZENCORE_API void Terminate(int ExitCode); + inline int Pid() const { return m_Pid; } + +private: + void* m_ProcessHandle = nullptr; + int m_Pid = 0; +}; + +ZENCORE_API bool IsProcessRunning(int pid); + +ZENCORE_API void Sleep(int ms); + +void thread_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/timer.h b/zencore/include/zencore/timer.h new file mode 100644 index 000000000..c9122eb44 --- /dev/null +++ b/zencore/include/zencore/timer.h @@ -0,0 +1,41 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <intrin.h> +#include <stdint.h> +#include "zencore.h" + +namespace zen { + +// High frequency timers + +ZENCORE_API uint64_t GetHifreqTimerValue(); +ZENCORE_API uint64_t GetHifreqTimerFrequency(); +ZENCORE_API uint64_t GetHifreqTimerFrequencySafe(); // May be used during static init + +class Stopwatch +{ +public: + Stopwatch() : m_StartValue(GetHifreqTimerValue()) {} + + inline uint64_t getElapsedTimeMs() { return (GetHifreqTimerValue() - m_StartValue) * 1000 / GetHifreqTimerFrequency(); } + + inline void reset() { m_StartValue = GetHifreqTimerValue(); } + +private: + uint64_t m_StartValue; +}; + +// CPU timers + +inline uint64_t +GetCpuTimerValue() +{ + unsigned int foo; + return __rdtscp(&foo); +} + +void timer_forcelink(); // internal + +} // namespace zen diff --git a/zencore/include/zencore/trace.h b/zencore/include/zencore/trace.h new file mode 100644 index 000000000..191ce4a3a --- /dev/null +++ b/zencore/include/zencore/trace.h @@ -0,0 +1,91 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <inttypes.h> +#include "zencore.h" + +#pragma section("trace_events", read) +#define U_TRACE_DECL __declspec(allocate("trace_events")) + +////////////////////////////////////////////////////////////////////////// + +namespace zen { + +struct TraceSite +{ + const char* sourceFile; + const uint32_t sourceLine; + const uint32_t flags; +}; + +struct TraceEvent +{ + const TraceSite* site; + ThreadId_t threadId; + const char* message; +}; + +enum TraceFlags +{ + kTrace_Debug = 1 << 0, + kTrace_Info = 1 << 1, + kTrace_Warn = 1 << 2, + kTrace_Error = 1 << 3, + kTrace_Fatal = 1 << 4, + + kTrace_Trace = 1 << 7, +}; + +class Tracer +{ +public: + void Log(const TraceEvent& e); + + __forceinline uint32_t Accept(const TraceSite& e) const { return (m_acceptFlags & e.flags); } + +private: + uint32_t m_acceptFlags = ~0u; +}; + +ZENCORE_API extern Tracer g_globalTracer; + +/** Trace event handler + */ +class TraceHandler +{ +public: + virtual void Trace(const TraceEvent& e) = 0; + +private: +}; + +ZENCORE_API static void TraceBroadcast(const TraceEvent& e); + +void trace_forcelink(); // internal + +} // namespace zen + +__forceinline zen::Tracer& +CurrentTracer() +{ + return zen::g_globalTracer; +} + +#define U_LOG_GENERIC(msg, flags) \ + do \ + { \ + zen::Tracer& t = CurrentTracer(); \ + static U_TRACE_DECL constexpr zen::TraceSite traceSite{__FILE__, __LINE__, flags}; \ + const zen::TraceEvent traceEvent = {&traceSite, 0u, msg}; \ + if (t.Accept(traceSite)) \ + t.Log(traceEvent); \ + } while (false) + +////////////////////////////////////////////////////////////////////////// + +#define U_LOG_DEBUG(msg) U_LOG_GENERIC(msg, zen::kTrace_Debug) +#define U_LOG_INFO(msg) U_LOG_GENERIC(msg, zen::kTrace_Info) +#define U_LOG_WARN(msg) U_LOG_GENERIC(msg, zen::kTrace_Warn) +#define U_LOG_ERROR(msg) U_LOG_GENERIC(msg, zen::kTrace_Error) +#define U_LOG_FATAL(msg) U_LOG_GENERIC(msg, zen::kTrace_Fatal) diff --git a/zencore/include/zencore/uid.h 
b/zencore/include/zencore/uid.h new file mode 100644 index 000000000..a793b160a --- /dev/null +++ b/zencore/include/zencore/uid.h @@ -0,0 +1,78 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/zencore.h> +#include <compare> + +namespace zen { + +class StringBuilderBase; + +/** Object identifier + + Can be used as a GUID essentially, but is more compact (12 bytes) and as such + is more susceptible to collisions than a 16-byte GUID but also I don't expect + the population to be large so in practice the risk should be minimal due to + how the identifiers work. + + Similar in spirit to MongoDB ObjectId + + When serialized, object identifiers generated in a given session in sequence + will sort in chronological order since the timestamp is in the MSB in big + endian format. This makes it suitable as a database key since most indexing + structures work better when keys are inserted in lexicographically + increasing order. + + The current layout is basically: + + |----------------|----------------|----------------| + | timestamp | serial # | run id | + |----------------|----------------|----------------| + MSB LSB + + - Timestamp is an unsigned 32-bit value (seconds since Jan 1 1970) + - Serial # is another unsigned 32-bit value which is assigned a (strong) + random number at initialization time which is incremented when a new Oid + is generated + - The run id is generated from a strong random number generator + at initialization time and stays fixed for the duration of the program + + Timestamp and serial are stored in memory in such a way that they can be + ordered lexicographically. I.e. they are in big-endian byte order.
+ + */ + +struct Oid +{ + static const int StringLength = 24; + typedef char String_t[StringLength + 1]; + + static void Initialize(); + [[nodiscard]] static Oid NewOid(); + + const Oid& Generate(); + [[nodiscard]] static Oid FromHexString(const std::string_view String); + StringBuilderBase& ToString(StringBuilderBase& OutString) const; + + auto operator<=>(const Oid& rhs) const = default; + + static const Oid Zero; // Min (can be used to signify a "null" value, or for open range queries) + static const Oid Max; // Max (can be used for open range queries) + + struct Hasher + { + size_t operator()(const Oid& id) const + { + const size_t seed = id.OidBits[0]; + return (seed << 6) + (seed >> 2) + 0x9e3779b9 + uint64_t(id.OidBits[1]) | (uint64_t(id.OidBits[2]) << 32); + } + }; + + // You should not assume anything about these words + uint32_t OidBits[3]; +}; + +extern void uid_forcelink(); + +} // namespace zen diff --git a/zencore/include/zencore/varint.h b/zencore/include/zencore/varint.h new file mode 100644 index 000000000..0c40dd66b --- /dev/null +++ b/zencore/include/zencore/varint.h @@ -0,0 +1,255 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "intmath.h" + +namespace zen { + +// Variable-Length Integer Encoding +// +// ZigZag encoding is used to convert signed integers into unsigned integers in a way that allows +// integers with a small magnitude to have a smaller encoded representation. +// +// An unsigned integer is encoded into 1-9 bytes based on its magnitude. The first byte indicates +// how many additional bytes are used by the number of leading 1-bits that it has. The additional +// bytes are stored in big endian order, and the most significant bits of the value are stored in +// the remaining bits in the first byte. The encoding of the first byte allows the reader to skip +// over the encoded integer without consuming its bytes individually. 
+// +// Encoded unsigned integers sort the same in a byte-wise comparison as when their decoded values +// are compared. The same property does not hold for signed integers due to ZigZag encoding. +// +// 32-bit inputs encode to 1-5 bytes. +// 64-bit inputs encode to 1-9 bytes. +// +// 0x0000'0000'0000'0000 - 0x0000'0000'0000'007f : 0b0_______ 1 byte +// 0x0000'0000'0000'0080 - 0x0000'0000'0000'3fff : 0b10______ 2 bytes +// 0x0000'0000'0000'4000 - 0x0000'0000'001f'ffff : 0b110_____ 3 bytes +// 0x0000'0000'0020'0000 - 0x0000'0000'0fff'ffff : 0b1110____ 4 bytes +// 0x0000'0000'1000'0000 - 0x0000'0007'ffff'ffff : 0b11110___ 5 bytes +// 0x0000'0008'0000'0000 - 0x0000'03ff'ffff'ffff : 0b111110__ 6 bytes +// 0x0000'0400'0000'0000 - 0x0001'ffff'ffff'ffff : 0b1111110_ 7 bytes +// 0x0002'0000'0000'0000 - 0x00ff'ffff'ffff'ffff : 0b11111110 8 bytes +// 0x0100'0000'0000'0000 - 0xffff'ffff'ffff'ffff : 0b11111111 9 bytes +// +// Encoding Examples +// -42 => ZigZag => 0x53 => 0x53 +// 42 => ZigZag => 0x54 => 0x54 +// 0x1 => 0x01 +// 0x12 => 0x12 +// 0x123 => 0x81 0x23 +// 0x1234 => 0x92 0x34 +// 0x12345 => 0xc1 0x23 0x45 +// 0x123456 => 0xd2 0x34 0x56 +// 0x1234567 => 0xe1 0x23 0x45 0x67 +// 0x12345678 => 0xf0 0x12 0x34 0x56 0x78 +// 0x123456789 => 0xf1 0x23 0x45 0x67 0x89 +// 0x123456789a => 0xf8 0x12 0x34 0x56 0x78 0x9a +// 0x123456789ab => 0xfb 0x23 0x45 0x67 0x89 0xab +// 0x123456789abc => 0xfc 0x12 0x34 0x56 0x78 0x9a 0xbc +// 0x123456789abcd => 0xfd 0x23 0x45 0x67 0x89 0xab 0xcd +// 0x123456789abcde => 0xfe 0x12 0x34 0x56 0x78 0x9a 0xbc 0xde +// 0x123456789abcdef => 0xff 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef +// 0x123456789abcdef0 => 0xff 0x12 0x34 0x56 0x78 0x9a 0xbc 0xde 0xf0 + +/** + * Measure the length in bytes (1-9) of an encoded variable-length integer. + * + * @param InData A variable-length encoding of an (signed or unsigned) integer. + * @return The number of bytes used to encode the integer, in the range 1-9. 
+ */ +inline uint32_t +MeasureVarUInt(const void* InData) +{ + return CountLeadingZeros(uint8_t(~*static_cast<const uint8_t*>(InData))) - 23; +} + +/** Measure the length in bytes (1-9) of an encoded variable-length integer. \see \ref MeasureVarUInt */ +inline uint32_t +MeasureVarInt(const void* InData) +{ + return MeasureVarUInt(InData); +} + +/** Measure the number of bytes (1-5) required to encode the 32-bit input. */ +inline uint32_t +MeasureVarUInt(uint32_t InValue) +{ + return uint32_t(int32_t(FloorLog2(InValue)) / 7 + 1); +} + +/** Measure the number of bytes (1-9) required to encode the 64-bit input. */ +inline uint32_t +MeasureVarUInt(uint64_t InValue) +{ + return uint32_t(std::min(int32_t(FloorLog2_64(InValue)) / 7 + 1, 9)); +} + +/** Measure the number of bytes (1-5) required to encode the 32-bit input. \see \ref MeasureVarUInt */ +inline uint32_t +MeasureVarInt(int32_t InValue) +{ + return MeasureVarUInt(uint32_t((InValue >> 31) ^ (InValue << 1))); +} + +/** Measure the number of bytes (1-9) required to encode the 64-bit input. \see \ref MeasureVarUInt */ +inline uint32_t +MeasureVarInt(int64_t InValue) +{ + return MeasureVarUInt(uint64_t((InValue >> 63) ^ (InValue << 1))); +} + +/** + * Read a variable-length unsigned integer. + * + * @param InData A variable-length encoding of an unsigned integer. + * @param OutByteCount The number of bytes consumed from the input. + * @return An unsigned integer. 
+ */ +inline uint64_t +ReadVarUInt(const void* InData, uint32_t& OutByteCount) +{ + const uint32_t ByteCount = MeasureVarUInt(InData); + OutByteCount = ByteCount; + + const uint8_t* InBytes = static_cast<const uint8_t*>(InData); + uint64_t Value = *InBytes++ & uint8_t(0xff >> ByteCount); + switch (ByteCount - 1) + { + case 8: + Value <<= 8; + Value |= *InBytes++; + case 7: + Value <<= 8; + Value |= *InBytes++; + case 6: + Value <<= 8; + Value |= *InBytes++; + case 5: + Value <<= 8; + Value |= *InBytes++; + case 4: + Value <<= 8; + Value |= *InBytes++; + case 3: + Value <<= 8; + Value |= *InBytes++; + case 2: + Value <<= 8; + Value |= *InBytes++; + case 1: + Value <<= 8; + Value |= *InBytes++; + default: + return Value; + } +} + +/** + * Read a variable-length signed integer. + * + * @param InData A variable-length encoding of a signed integer. + * @param OutByteCount The number of bytes consumed from the input. + * @return A signed integer. + */ +inline int64_t +ReadVarInt(const void* InData, uint32_t& OutByteCount) +{ + const uint64_t Value = ReadVarUInt(InData, OutByteCount); + return -int64_t(Value & 1) ^ int64_t(Value >> 1); +} + +/** + * Write a variable-length unsigned integer. + * + * @param InValue An unsigned integer to encode. + * @param OutData A buffer of at least 5 bytes to write the output to. + * @return The number of bytes used in the output. 
+ */ +inline uint32_t +WriteVarUInt(uint32_t InValue, void* OutData) +{ + const uint32_t ByteCount = MeasureVarUInt(InValue); + uint8_t* OutBytes = static_cast<uint8_t*>(OutData) + ByteCount - 1; + switch (ByteCount - 1) + { + case 4: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 3: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 2: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 1: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + default: + break; + } + *OutBytes = uint8_t(0xff << (9 - ByteCount)) | uint8_t(InValue); + return ByteCount; +} + +/** + * Write a variable-length unsigned integer. + * + * @param InValue An unsigned integer to encode. + * @param OutData A buffer of at least 9 bytes to write the output to. + * @return The number of bytes used in the output. + */ +inline uint32_t +WriteVarUInt(uint64_t InValue, void* OutData) +{ + const uint32_t ByteCount = MeasureVarUInt(InValue); + uint8_t* OutBytes = static_cast<uint8_t*>(OutData) + ByteCount - 1; + switch (ByteCount - 1) + { + case 8: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 7: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 6: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 5: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 4: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 3: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 2: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + case 1: + *OutBytes-- = uint8_t(InValue); + InValue >>= 8; + default: + break; + } + *OutBytes = uint8_t(0xff << (9 - ByteCount)) | uint8_t(InValue); + return ByteCount; +} + +/** Write a variable-length signed integer. \see \ref WriteVarUInt */ +inline uint32_t +WriteVarInt(int32_t InValue, void* OutData) +{ + const uint32_t Value = uint32_t((InValue >> 31) ^ (InValue << 1)); + return WriteVarUInt(Value, OutData); +} + +/** Write a variable-length signed integer. 
\see \ref WriteVarUInt */ +inline uint32_t +WriteVarInt(int64_t InValue, void* OutData) +{ + const uint64_t Value = uint64_t((InValue >> 63) ^ (InValue << 1)); + return WriteVarUInt(Value, OutData); +} + +} // namespace zen diff --git a/zencore/include/zencore/windows.h b/zencore/include/zencore/windows.h new file mode 100644 index 000000000..8888bf757 --- /dev/null +++ b/zencore/include/zencore/windows.h @@ -0,0 +1,10 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +struct IUnknown; // Workaround for "combaseapi.h(229): error C2187: syntax error: 'identifier' was unexpected here" when using /permissive- +#ifndef NOMINMAX +# define NOMINMAX // We don't want your min/max macros +#endif +#define WIN32_LEAN_AND_MEAN +#include <windows.h> diff --git a/zencore/include/zencore/xxhash.h b/zencore/include/zencore/xxhash.h new file mode 100644 index 000000000..5407755df --- /dev/null +++ b/zencore/include/zencore/xxhash.h @@ -0,0 +1,87 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include "zencore.h" + +#include <zencore/memory.h> + +#include <xxh3.h> +#include <string_view> + +namespace zen { + +class StringBuilderBase; + +/** + * XXH3 hash + */ +struct XXH3_128 +{ + uint8_t Hash[16]; + + static XXH3_128 MakeFrom(const void* data /* 16 bytes */) + { + XXH3_128 Xx; + memcpy(Xx.Hash, data, sizeof Xx); + return Xx; + } + + static inline XXH3_128 HashMemory(const void* data, size_t byteCount) + { + XXH3_128 Hash; + XXH128_canonicalFromHash((XXH128_canonical_t*)Hash.Hash, XXH3_128bits(data, byteCount)); + return Hash; + } + static XXH3_128 HashMemory(MemoryView Data) { return HashMemory(Data.GetData(), Data.GetSize()); } + static XXH3_128 FromHexString(const char* string); + static XXH3_128 FromHexString(const std::string_view string); + const char* ToHexString(char* outString /* 32 characters + NUL terminator */) const; + StringBuilderBase& ToHexString(StringBuilderBase& outBuilder) const; + + static const int StringLength = 32; + typedef char String_t[StringLength + 1]; + + static XXH3_128 Zero; // Initialized to all zeros + + inline auto operator<=>(const XXH3_128& rhs) const = default; + + struct Hasher + { + size_t operator()(const XXH3_128& v) const + { + size_t h; + memcpy(&h, v.Hash, sizeof h); + return h; + } + }; +}; + +struct XXH3_128Stream +{ + /// Begin streaming hash compute (not needed on freshly constructed instance) + void Reset() { memset(&m_State, 0, sizeof m_State); } + + /// Append another chunk + XXH3_128Stream& Append(const void* Data, size_t ByteCount) + { + XXH3_128bits_update(&m_State, Data, ByteCount); + return *this; + } + + /// Append another chunk + XXH3_128Stream& Append(MemoryView Data) { return Append(Data.GetData(), Data.GetSize()); } + + /// Obtain final hash. 
If you wish to reuse the instance call reset() + XXH3_128 GetHash() + { + XXH3_128 Hash; + XXH128_canonicalFromHash((XXH128_canonical_t*)Hash.Hash, XXH3_128bits_digest(&m_State)); + return Hash; + } + +private: + XXH3_state_s m_State{}; +}; + +} // namespace zen diff --git a/zencore/include/zencore/zencore.h b/zencore/include/zencore/zencore.h new file mode 100644 index 000000000..4a448776b --- /dev/null +++ b/zencore/include/zencore/zencore.h @@ -0,0 +1,134 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <cinttypes> +#include <exception> +#include <string> + +////////////////////////////////////////////////////////////////////////// +// Platform +// + +#define ZEN_PLATFORM_WINDOWS 1 +#define ZEN_PLATFORM_LINUX 0 +#define ZEN_PLATFORM_MACOS 0 + +////////////////////////////////////////////////////////////////////////// +// Compiler +// + +#ifdef _MSC_VER +# define ZEN_COMPILER_MSC 1 +#endif + +#ifndef ZEN_COMPILER_MSC +# define ZEN_COMPILER_MSC 0 +#endif + +#ifndef ZEN_COMPILER_CLANG +# define ZEN_COMPILER_CLANG 0 +#endif + +////////////////////////////////////////////////////////////////////////// +// Build flavor +// + +#ifdef NDEBUG +# define ZEN_BUILD_DEBUG 0 +# define ZEN_BUILD_RELEASE 1 +#else +# define ZEN_BUILD_DEBUG 1 +# define ZEN_BUILD_RELEASE 0 +#endif + +////////////////////////////////////////////////////////////////////////// + +#define ZEN_PLATFORM_SUPPORTS_UNALIGNED_LOADS 1 + +////////////////////////////////////////////////////////////////////////// +// Assert +// + +namespace zen { + +class AssertException : public std::exception +{ +public: + AssertException(const char* Msg) : m_Msg(Msg) {} + + [[nodiscard]] virtual char const* what() const override { return m_Msg.c_str(); } + +private: + std::string m_Msg; +}; + +} // namespace zen + +#define ZEN_ASSERT(x, ...) \ + do \ + { \ + if (x) \ + break; \ + throw ::zen::AssertException{#x}; \ + } while (false) + +#ifndef NDEBUG +# define ZEN_ASSERT_SLOW(x, ...) 
// Compile-time element count of a C array.
//
// ZenArrayCountHelper is only ever declared: ZEN_ARRAY_COUNT takes sizeof() of
// its return type, a reference to a char array with one element more than the
// argument (the +1/-1 avoids a zero-sized array type). Passing anything that is
// not an array fails to compile.
#ifdef __clang__
template<typename T>
auto ZenArrayCountHelper(T& t) -> typename std::enable_if<__is_array(T), char (&)[sizeof(t) / sizeof(t[0]) + 1]>::type;
#else
template<typename T, uint32_t N>
char (&ZenArrayCountHelper(const T (&)[N]))[N + 1];
#endif

#define ZEN_ARRAY_COUNT(array) (sizeof(ZenArrayCountHelper(array)) - 1)
000000000..a42dd83f4 --- /dev/null +++ b/zencore/iobuffer.cpp @@ -0,0 +1,341 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zencore/iobuffer.h> + +#include <doctest/doctest.h> +#include <memory.h> +#include <zencore/memory.h> +#include <zencore/thread.h> +#include <system_error> + +#include <atlfile.h> +#include <spdlog/spdlog.h> +#include <gsl/gsl-lite.hpp> + +namespace zen { + +////////////////////////////////////////////////////////////////////////// + +void* +IoBufferCore::AllocateBuffer(size_t InSize, size_t Alignment) +{ + if (((InSize & 0xffFF) == 0) && (Alignment == 0x10000)) + { + m_Flags |= kLowLevelAlloc; + return VirtualAlloc(nullptr, InSize, MEM_COMMIT, PAGE_READWRITE); + } + else + { + return Memory::Alloc(InSize, Alignment); + } +} + +void +IoBufferCore::FreeBuffer() +{ + if (m_Flags & kLowLevelAlloc) + { + VirtualFree(const_cast<void*>(m_DataPtr), 0, MEM_DECOMMIT); + } + else + { + return Memory::Free(const_cast<void*>(m_DataPtr)); + } +} + +////////////////////////////////////////////////////////////////////////// + +IoBufferCore::IoBufferCore(size_t InSize) +{ + static_assert(sizeof(IoBufferCore) == 32); + + m_DataPtr = AllocateBuffer(InSize, sizeof(void*)); + m_DataBytes = InSize; + + SetIsOwned(true); +} + +IoBufferCore::IoBufferCore(size_t InSize, size_t Alignment) +{ + m_DataPtr = AllocateBuffer(InSize, Alignment); + m_DataBytes = InSize; + + SetIsOwned(true); +} + +IoBufferCore::~IoBufferCore() +{ + if (IsOwned() && m_DataPtr) + { + FreeBuffer(); + m_DataPtr = nullptr; + } +} + +void +IoBufferCore::DeleteThis() const +{ + // We do this just to avoid paying for the cost of a vtable + if (const IoBufferExtendedCore* _ = ExtendedCore()) + { + delete _; + } + else + { + delete this; + } +} + +void +IoBufferCore::Materialize() const +{ + if (const IoBufferExtendedCore* _ = ExtendedCore()) + { + _->Materialize(); + } +} + +void +IoBufferCore::MakeOwned(bool Immutable) +{ + if (!IsOwned()) + { + void* OwnedDataPtr = 
AllocateBuffer(m_DataBytes, sizeof(void*)); + memcpy(OwnedDataPtr, m_DataPtr, m_DataBytes); + + m_DataPtr = OwnedDataPtr; + SetIsOwned(true); + } + + SetIsImmutable(Immutable); +} + +////////////////////////////////////////////////////////////////////////// + +IoBufferExtendedCore::IoBufferExtendedCore(void* FileHandle, uint64_t Offset, uint64_t Size, bool TransferHandleOwnership) +: IoBufferCore(nullptr, Size) +, m_FileHandle(FileHandle) +, m_FileOffset(Offset) +{ + m_Flags |= kIsOwned | kIsExtended; + + if (TransferHandleOwnership) + { + m_Flags |= kOwnsFile; + } +} + +IoBufferExtendedCore::IoBufferExtendedCore(const IoBufferExtendedCore* Outer, uint64_t Offset, uint64_t Size) +: IoBufferCore(Outer, nullptr, Size) +, m_FileHandle(Outer->m_FileHandle) +, m_FileOffset(Outer->m_FileOffset + Offset) +{ + m_Flags |= kIsOwned | kIsExtended; +} + +IoBufferExtendedCore::~IoBufferExtendedCore() +{ + if (m_MappedPointer) + { + UnmapViewOfFile(m_MappedPointer); + } + + if (m_Flags & kOwnsMmap) + { + CloseHandle(m_MmapHandle); + } + + if (m_Flags & kOwnsFile) + { + BOOL Success = CloseHandle(m_FileHandle); + + if (!Success) + { + spdlog::warn("Error reported on file handle close!"); + } + } + + m_DataPtr = nullptr; +} + +RwLock g_MappingLock; + +void +IoBufferExtendedCore::Materialize() const +{ + // The synchronization scheme here is very primitive, if we end up with + // a lot of contention we can make it more fine-grained + + if (m_MmapHandle) + return; + + RwLock::ExclusiveLockScope _(g_MappingLock); + + // Someone could have gotten here first + if (m_MmapHandle) + return; + + m_MmapHandle = CreateFileMapping(m_FileHandle, + /* lpFileMappingAttributes */ nullptr, + /* flProtect */ PAGE_READONLY, + /* dwMaximumSizeLow */ 0, + /* dwMaximumSizeHigh */ 0, + /* lpName */ nullptr); + + if (m_MmapHandle == nullptr) + { + throw std::system_error(std::error_code(::GetLastError(), std::system_category()), "file copy failed"); + } + + m_Flags |= kOwnsMmap; + + const uint64_t 
MapOffset = m_FileOffset & ~0xffffull; + const uint64_t MappedOffsetDisplacement = m_FileOffset - MapOffset; + const uint64_t MapSize = (MappedOffsetDisplacement + m_DataBytes + 0xffffu) & ~0xffffull; + + void* MappedBase = MapViewOfFile(m_MmapHandle, + /* dwDesiredAccess */ FILE_MAP_READ, + /* FileOffsetHigh */ uint32_t(MapOffset >> 32), + /* FileOffsetLow */ uint32_t(MapOffset & 0xffFFffFFu), + /* dwNumberOfBytesToMap */ m_DataBytes); + + if (MappedBase == nullptr) + { + throw std::system_error(std::error_code(::GetLastError(), std::system_category()), "MapViewOfFile failed"); + } + + m_MappedPointer = MappedBase; + m_DataPtr = reinterpret_cast<uint8_t*>(MappedBase) + MappedOffsetDisplacement; + + m_Flags |= kIsMaterialized; +} + +bool +IoBufferExtendedCore::GetFileReference(IoBufferFileReference& OutRef) const +{ + if (m_FileHandle == nullptr) + { + return false; + } + + OutRef.FileHandle = m_FileHandle; + OutRef.FileChunkOffset = m_FileOffset; + OutRef.FileChunkSize = m_DataBytes; + + return true; +} + +////////////////////////////////////////////////////////////////////////// + +IoBuffer::IoBuffer(size_t InSize) : m_Core(new IoBufferCore(InSize)) +{ +} + +IoBuffer::IoBuffer(size_t InSize, uint64_t InAlignment) : m_Core(new IoBufferCore(InSize, InAlignment)) +{ +} + +IoBuffer::IoBuffer(const IoBuffer& OuterBuffer, size_t Offset, size_t Size) +{ + if (Size == ~(0ull)) + { + Size = std::clamp<size_t>(Size, 0, OuterBuffer.Size() - Offset); + } + + ZEN_ASSERT(Offset <= OuterBuffer.Size()); + ZEN_ASSERT((Offset + Size) <= OuterBuffer.Size()); + + if (IoBufferExtendedCore* Extended = OuterBuffer.m_Core->ExtendedCore()) + { + m_Core = new IoBufferExtendedCore(Extended, Offset, Size); + } + else + { + m_Core = new IoBufferCore(OuterBuffer.m_Core, reinterpret_cast<const uint8_t*>(OuterBuffer.Data()) + Offset, Size); + } +} + +IoBuffer::IoBuffer(EFileTag, void* FileHandle, uint64_t ChunkFileOffset, uint64_t ChunkSize) +: m_Core(new IoBufferExtendedCore(FileHandle, 
ChunkFileOffset, ChunkSize, /* owned */ true)) +{ +} + +IoBuffer::IoBuffer(EBorrowedFileTag, void* FileHandle, uint64_t ChunkFileOffset, uint64_t ChunkSize) +: m_Core(new IoBufferExtendedCore(FileHandle, ChunkFileOffset, ChunkSize, /* owned */ false)) +{ +} + +bool +IoBuffer::GetFileReference(IoBufferFileReference& OutRef) const +{ + if (IoBufferExtendedCore* ExtCore = m_Core->ExtendedCore()) + { + if (ExtCore->GetFileReference(OutRef)) + { + return true; + } + } + + // Not a file reference + + OutRef.FileHandle = 0; + OutRef.FileChunkOffset = ~0ull; + OutRef.FileChunkSize = 0; + + return false; +} + +////////////////////////////////////////////////////////////////////////// + +IoBuffer +IoBufferBuilder::MakeFromFileHandle(void* FileHandle, uint64_t Offset, uint64_t Size) +{ + return IoBuffer(IoBuffer::BorrowedFile, FileHandle, Offset, Size); +} + +IoBuffer +IoBufferBuilder::MakeFromFile(const wchar_t* FileName, uint64_t Offset, uint64_t Size) +{ + CAtlFile DataFile; + + HRESULT hRes = DataFile.Create(FileName, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING); + + if (SUCCEEDED(hRes)) + { + ULONGLONG FileSize; + DataFile.GetSize(FileSize); + + if (Size == ~0ull) + { + Size = FileSize; + } + else + { + // Clamp size + if ((Offset + Size) > FileSize) + { + Size = FileSize - Offset; + } + } + + return IoBuffer(IoBuffer::File, DataFile.Detach(), Offset, Size); + } + + return {}; +} + +////////////////////////////////////////////////////////////////////////// + +void +iobuffer_forcelink() +{ +} + +TEST_CASE("IoBuffer") +{ + zen::IoBuffer buffer1; + zen::IoBuffer buffer2(16384); + zen::IoBuffer buffer3(buffer2, 0, buffer2.Size()); +} + +} // namespace zen diff --git a/zencore/iohash.cpp b/zencore/iohash.cpp new file mode 100644 index 000000000..afe2e54ba --- /dev/null +++ b/zencore/iohash.cpp @@ -0,0 +1,73 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <zencore/iohash.h> + +#include <zencore/blake3.h> +#include <zencore/string.h> + +#include <doctest/doctest.h> +#include <gsl/gsl-lite.hpp> + +namespace zen { + +IoHash IoHash::Zero; // Initialized to all zeros + +IoHash +IoHash::HashMemory(const void* data, size_t byteCount) +{ + BLAKE3 b3 = BLAKE3::HashMemory(data, byteCount); + + IoHash io; + memcpy(io.Hash, b3.Hash, sizeof io.Hash); + + return io; +} + +IoHash +IoHash::FromHexString(const char* string) +{ + return FromHexString({string, sizeof(IoHash::Hash) * 2}); +} + +IoHash +IoHash::FromHexString(std::string_view string) +{ + ZEN_ASSERT(string.size() == 2 * sizeof(IoHash::Hash)); + + IoHash io; + + ParseHexBytes(string.data(), string.size(), io.Hash); + + return io; +} + +const char* +IoHash::ToHexString(char* outString /* 40 characters + NUL terminator */) const +{ + ToHexBytes(Hash, sizeof(IoHash), outString); + outString[2 * sizeof(IoHash)] = '\0'; + + return outString; +} + +StringBuilderBase& +IoHash::ToHexString(StringBuilderBase& outBuilder) const +{ + String_t Str; + ToHexString(Str); + + outBuilder.AppendRange(Str, &Str[StringLength]); + + return outBuilder; +} + +std::string +IoHash::ToHexString() const +{ + String_t Str; + ToHexString(Str); + + return Str; +} + +} // namespace zen diff --git a/zencore/iothreadpool.cpp b/zencore/iothreadpool.cpp new file mode 100644 index 000000000..4ed81d7a2 --- /dev/null +++ b/zencore/iothreadpool.cpp @@ -0,0 +1,36 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "iothreadpool.h" + +namespace zen { + +WinIoThreadPool::WinIoThreadPool(int InThreadCount) +{ + // Thread pool setup + + m_ThreadPool = CreateThreadpool(NULL); + + SetThreadpoolThreadMinimum(m_ThreadPool, InThreadCount); + SetThreadpoolThreadMaximum(m_ThreadPool, InThreadCount * 2); + + InitializeThreadpoolEnvironment(&m_CallbackEnvironment); + + m_CleanupGroup = CreateThreadpoolCleanupGroup(); + + SetThreadpoolCallbackPool(&m_CallbackEnvironment, m_ThreadPool); + + SetThreadpoolCallbackCleanupGroup(&m_CallbackEnvironment, m_CleanupGroup, NULL); +} + +WinIoThreadPool::~WinIoThreadPool() +{ + CloseThreadpool(m_ThreadPool); +} + +void +WinIoThreadPool::CreateIocp(HANDLE IoHandle, PTP_WIN32_IO_CALLBACK Callback, void* Context) +{ + m_ThreadPoolIo = CreateThreadpoolIo(IoHandle, Callback, Context, &m_CallbackEnvironment); +} + +} // namespace zen diff --git a/zencore/iothreadpool.h b/zencore/iothreadpool.h new file mode 100644 index 000000000..f64868540 --- /dev/null +++ b/zencore/iothreadpool.h @@ -0,0 +1,31 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/windows.h> + +namespace zen { + +////////////////////////////////////////////////////////////////////////// +// +// Thread pool. Implemented in terms of Windows thread pool right now, will +// need a cross-platform implementation eventually +// + +class WinIoThreadPool +{ +public: + WinIoThreadPool(int InThreadCount); + ~WinIoThreadPool(); + + void CreateIocp(HANDLE IoHandle, PTP_WIN32_IO_CALLBACK Callback, void* Context); + inline PTP_IO Iocp() const { return m_ThreadPoolIo; } + +private: + PTP_POOL m_ThreadPool = nullptr; + PTP_CLEANUP_GROUP m_CleanupGroup = nullptr; + PTP_IO m_ThreadPoolIo = nullptr; + TP_CALLBACK_ENVIRON m_CallbackEnvironment; +}; + +} // namespace zen diff --git a/zencore/md5.cpp b/zencore/md5.cpp new file mode 100644 index 000000000..228c0feff --- /dev/null +++ b/zencore/md5.cpp @@ -0,0 +1,446 @@ +// Copyright Epic Games, Inc. 
All Rights Reserved. + +#include <zencore/md5.h> +#include <zencore/string.h> +#include <zencore/zencore.h> + +#include <doctest/doctest.h> +#include <string.h> + +// big endian architectures need #define __BYTE_ORDER __BIG_ENDIAN +#ifndef _MSC_VER +# include <endian.h> +#endif + +/* + ********************************************************************** + ** md5.h -- Header file for implementation of MD5 ** + ** RSA Data Security, Inc. MD5 Message Digest Algorithm ** + ** Created: 2/17/90 RLR ** + ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version ** + ** Revised (for MD5): RLR 4/27/91 ** + ** -- G modified to have y&~z instead of y&z ** + ** -- FF, GG, HH modified to add in last register done ** + ** -- Access pattern: round 2 works mod 5, round 3 works mod 3 ** + ** -- distinct additive constant for each step ** + ** -- round 4 added, working mod 7 ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. It is provided "as ** + ** is" without express or implied warranty of any kind. 
** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. ** + ********************************************************************** + */ + +/* Data structure for MD5 (Message Digest) computation */ +struct MD5_CTX +{ + uint32_t i[2]; /* number of _bits_ handled mod 2^64 */ + uint32_t buf[4]; /* scratch buffer */ + unsigned char in[64]; /* input buffer */ + unsigned char digest[16]; /* actual digest after MD5Final call */ +}; + +void MD5Init(); +void MD5Update(); +void MD5Final(); + +/* + ********************************************************************** + ** End of md5.h ** + ******************************* (cut) ******************************** + */ + +/* + ********************************************************************** + ** md5.c ** + ** RSA Data Security, Inc. MD5 Message Digest Algorithm ** + ** Created: 2/17/90 RLR ** + ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version ** + ********************************************************************** + */ + +/* + ********************************************************************** + ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. ** + ** ** + ** License to copy and use this software is granted provided that ** + ** it is identified as the "RSA Data Security, Inc. MD5 Message ** + ** Digest Algorithm" in all material mentioning or referencing this ** + ** software or this function. ** + ** ** + ** License is also granted to make and use derivative works ** + ** provided that such works are identified as "derived from the RSA ** + ** Data Security, Inc. MD5 Message Digest Algorithm" in all ** + ** material mentioning or referencing the derived work. ** + ** ** + ** RSA Data Security, Inc. makes no representations concerning ** + ** either the merchantability of this software or the suitability ** + ** of this software for any particular purpose. 
It is provided "as ** + ** is" without express or implied warranty of any kind. ** + ** ** + ** These notices must be retained in any copies of any part of this ** + ** documentation and/or software. ** + ********************************************************************** + */ + +/* -- include the following line if the md5.h header file is separate -- */ +/* #include "md5.h" */ + +/* forward declaration */ +static void Transform(uint32_t* buf, uint32_t* in); + +static unsigned char PADDING[64] = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + +/* F, G and H are basic MD5 functions: selection, majority, parity */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) + +/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */ +/* Rotation is separate from addition to prevent recomputation */ +#define FF(a, b, c, d, x, s, ac) \ + { \ + (a) += F((b), (c), (d)) + (x) + (uint32_t)(ac); \ + (a) = ROTATE_LEFT((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) \ + { \ + (a) += G((b), (c), (d)) + (x) + (uint32_t)(ac); \ + (a) = ROTATE_LEFT((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) \ + { \ + (a) += H((b), (c), (d)) + (x) + (uint32_t)(ac); \ + (a) = ROTATE_LEFT((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) \ + { \ + (a) += I((b), (c), (d)) + (x) + (uint32_t)(ac); \ + (a) = ROTATE_LEFT((a), (s)); \ + (a) += (b); \ + } + +void 
+MD5Init(MD5_CTX* mdContext) +{ + mdContext->i[0] = mdContext->i[1] = (uint32_t)0; + + /* Load magic initialization constants. + */ + mdContext->buf[0] = (uint32_t)0x67452301; + mdContext->buf[1] = (uint32_t)0xefcdab89; + mdContext->buf[2] = (uint32_t)0x98badcfe; + mdContext->buf[3] = (uint32_t)0x10325476; +} + +void +MD5Update(MD5_CTX* mdContext, unsigned char* inBuf, unsigned int inLen) +{ + uint32_t in[16]; + int mdi; + unsigned int i, ii; + + /* compute number of bytes mod 64 */ + mdi = (int)((mdContext->i[0] >> 3) & 0x3F); + + /* update number of bits */ + if ((mdContext->i[0] + ((uint32_t)inLen << 3)) < mdContext->i[0]) + mdContext->i[1]++; + mdContext->i[0] += ((uint32_t)inLen << 3); + mdContext->i[1] += ((uint32_t)inLen >> 29); + + while (inLen--) + { + /* add new character to buffer, increment mdi */ + mdContext->in[mdi++] = *inBuf++; + + /* transform if necessary */ + if (mdi == 0x40) + { + for (i = 0, ii = 0; i < 16; i++, ii += 4) + in[i] = (((uint32_t)mdContext->in[ii + 3]) << 24) | (((uint32_t)mdContext->in[ii + 2]) << 16) | + (((uint32_t)mdContext->in[ii + 1]) << 8) | ((uint32_t)mdContext->in[ii]); + Transform(mdContext->buf, in); + mdi = 0; + } + } +} + +void +MD5Final(MD5_CTX* mdContext) +{ + uint32_t in[16]; + int mdi; + unsigned int i, ii; + unsigned int padLen; + + /* save number of bits */ + in[14] = mdContext->i[0]; + in[15] = mdContext->i[1]; + + /* compute number of bytes mod 64 */ + mdi = (int)((mdContext->i[0] >> 3) & 0x3F); + + /* pad out to 56 mod 64 */ + padLen = (mdi < 56) ? 
(56 - mdi) : (120 - mdi); + MD5Update(mdContext, PADDING, padLen); + + /* append length in bits and transform */ + for (i = 0, ii = 0; i < 14; i++, ii += 4) + in[i] = (((uint32_t)mdContext->in[ii + 3]) << 24) | (((uint32_t)mdContext->in[ii + 2]) << 16) | + (((uint32_t)mdContext->in[ii + 1]) << 8) | ((uint32_t)mdContext->in[ii]); + Transform(mdContext->buf, in); + + /* store buffer in digest */ + for (i = 0, ii = 0; i < 4; i++, ii += 4) + { + mdContext->digest[ii] = (unsigned char)(mdContext->buf[i] & 0xFF); + mdContext->digest[ii + 1] = (unsigned char)((mdContext->buf[i] >> 8) & 0xFF); + mdContext->digest[ii + 2] = (unsigned char)((mdContext->buf[i] >> 16) & 0xFF); + mdContext->digest[ii + 3] = (unsigned char)((mdContext->buf[i] >> 24) & 0xFF); + } +} + +/* Basic MD5 step. Transform buf based on in. + */ +static void +Transform(uint32_t* buf, uint32_t* in) +{ + uint32_t a = buf[0], b = buf[1], c = buf[2], d = buf[3]; + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF(a, b, c, d, in[0], S11, 3614090360); /* 1 */ + FF(d, a, b, c, in[1], S12, 3905402710); /* 2 */ + FF(c, d, a, b, in[2], S13, 606105819); /* 3 */ + FF(b, c, d, a, in[3], S14, 3250441966); /* 4 */ + FF(a, b, c, d, in[4], S11, 4118548399); /* 5 */ + FF(d, a, b, c, in[5], S12, 1200080426); /* 6 */ + FF(c, d, a, b, in[6], S13, 2821735955); /* 7 */ + FF(b, c, d, a, in[7], S14, 4249261313); /* 8 */ + FF(a, b, c, d, in[8], S11, 1770035416); /* 9 */ + FF(d, a, b, c, in[9], S12, 2336552879); /* 10 */ + FF(c, d, a, b, in[10], S13, 4294925233); /* 11 */ + FF(b, c, d, a, in[11], S14, 2304563134); /* 12 */ + FF(a, b, c, d, in[12], S11, 1804603682); /* 13 */ + FF(d, a, b, c, in[13], S12, 4254626195); /* 14 */ + FF(c, d, a, b, in[14], S13, 2792965006); /* 15 */ + FF(b, c, d, a, in[15], S14, 1236535329); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG(a, b, c, d, in[1], S21, 4129170786); /* 17 */ + GG(d, a, b, c, in[6], S22, 3225465664); /* 
18 */ + GG(c, d, a, b, in[11], S23, 643717713); /* 19 */ + GG(b, c, d, a, in[0], S24, 3921069994); /* 20 */ + GG(a, b, c, d, in[5], S21, 3593408605); /* 21 */ + GG(d, a, b, c, in[10], S22, 38016083); /* 22 */ + GG(c, d, a, b, in[15], S23, 3634488961); /* 23 */ + GG(b, c, d, a, in[4], S24, 3889429448); /* 24 */ + GG(a, b, c, d, in[9], S21, 568446438); /* 25 */ + GG(d, a, b, c, in[14], S22, 3275163606); /* 26 */ + GG(c, d, a, b, in[3], S23, 4107603335); /* 27 */ + GG(b, c, d, a, in[8], S24, 1163531501); /* 28 */ + GG(a, b, c, d, in[13], S21, 2850285829); /* 29 */ + GG(d, a, b, c, in[2], S22, 4243563512); /* 30 */ + GG(c, d, a, b, in[7], S23, 1735328473); /* 31 */ + GG(b, c, d, a, in[12], S24, 2368359562); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH(a, b, c, d, in[5], S31, 4294588738); /* 33 */ + HH(d, a, b, c, in[8], S32, 2272392833); /* 34 */ + HH(c, d, a, b, in[11], S33, 1839030562); /* 35 */ + HH(b, c, d, a, in[14], S34, 4259657740); /* 36 */ + HH(a, b, c, d, in[1], S31, 2763975236); /* 37 */ + HH(d, a, b, c, in[4], S32, 1272893353); /* 38 */ + HH(c, d, a, b, in[7], S33, 4139469664); /* 39 */ + HH(b, c, d, a, in[10], S34, 3200236656); /* 40 */ + HH(a, b, c, d, in[13], S31, 681279174); /* 41 */ + HH(d, a, b, c, in[0], S32, 3936430074); /* 42 */ + HH(c, d, a, b, in[3], S33, 3572445317); /* 43 */ + HH(b, c, d, a, in[6], S34, 76029189); /* 44 */ + HH(a, b, c, d, in[9], S31, 3654602809); /* 45 */ + HH(d, a, b, c, in[12], S32, 3873151461); /* 46 */ + HH(c, d, a, b, in[15], S33, 530742520); /* 47 */ + HH(b, c, d, a, in[2], S34, 3299628645); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II(a, b, c, d, in[0], S41, 4096336452); /* 49 */ + II(d, a, b, c, in[7], S42, 1126891415); /* 50 */ + II(c, d, a, b, in[14], S43, 2878612391); /* 51 */ + II(b, c, d, a, in[5], S44, 4237533241); /* 52 */ + II(a, b, c, d, in[12], S41, 1700485571); /* 53 */ + II(d, a, b, c, in[3], S42, 
2399980690); /* 54 */ + II(c, d, a, b, in[10], S43, 4293915773); /* 55 */ + II(b, c, d, a, in[1], S44, 2240044497); /* 56 */ + II(a, b, c, d, in[8], S41, 1873313359); /* 57 */ + II(d, a, b, c, in[15], S42, 4264355552); /* 58 */ + II(c, d, a, b, in[6], S43, 2734768916); /* 59 */ + II(b, c, d, a, in[13], S44, 1309151649); /* 60 */ + II(a, b, c, d, in[4], S41, 4149444226); /* 61 */ + II(d, a, b, c, in[11], S42, 3174756917); /* 62 */ + II(c, d, a, b, in[2], S43, 718787259); /* 63 */ + II(b, c, d, a, in[9], S44, 3951481745); /* 64 */ + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +/* + ********************************************************************** + ** End of md5.c ** + ******************************* (cut) ******************************** + */ + +#undef FF +#undef GG +#undef HH +#undef II +#undef F +#undef G +#undef H +#undef I + +namespace zen { + +////////////////////////////////////////////////////////////////////////// + +MD5 MD5::Zero; // Initialized to all zeroes + +////////////////////////////////////////////////////////////////////////// + +MD5Stream::MD5Stream() +{ + Reset(); +} + +void +MD5Stream::Reset() +{ +} + +MD5Stream& +MD5Stream::Append(const void* Data, size_t ByteCount) +{ + ZEN_UNUSED(Data); + ZEN_UNUSED(ByteCount); + + return *this; +} + +MD5 +MD5Stream::GetHash() +{ + MD5 md5{}; + + return md5; +} + +////////////////////////////////////////////////////////////////////////// + +MD5 +MD5::HashMemory(const void* data, size_t byteCount) +{ + return MD5Stream().Append(data, byteCount).GetHash(); +} + +MD5 +MD5::FromHexString(const char* string) +{ + MD5 md5; + + ParseHexBytes(string, 40, md5.Hash); + + return md5; +} + +const char* +MD5::ToHexString(char* outString /* 40 characters + NUL terminator */) const +{ + ToHexBytes(Hash, sizeof(MD5), outString); + outString[2 * sizeof(MD5)] = '\0'; + + return outString; +} + +StringBuilderBase& +MD5::ToHexString(StringBuilderBase& outBuilder) const +{ + char str[41]; + 
ToHexString(str); + + outBuilder.AppendRange(str, &str[40]); + + return outBuilder; +} + +////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... +// + +void +md5_forcelink() +{ +} + +doctest::String +toString(const MD5& value) +{ + char md5text[2 * sizeof(MD5) + 1]; + value.ToHexString(md5text); + + return md5text; +} + +TEST_CASE("MD5") +{ +} + +} // namespace zen diff --git a/zencore/memory.cpp b/zencore/memory.cpp new file mode 100644 index 000000000..63d61f5e1 --- /dev/null +++ b/zencore/memory.cpp @@ -0,0 +1,165 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <malloc.h> +#include <zencore/intmath.h> +#include <zencore/memory.h> + +#include <doctest/doctest.h> + +namespace zen { + +////////////////////////////////////////////////////////////////////////// + +MemoryArena::MemoryArena() +{ +} + +MemoryArena::~MemoryArena() +{ +} + +void* +MemoryArena::Alloc(size_t size, size_t alignment) +{ + return _mm_malloc(size, alignment); +} + +void +MemoryArena::Free(void* ptr) +{ + if (ptr) + _mm_free(ptr); +} + +////////////////////////////////////////////////////////////////////////// + +void* +Memory::Alloc(size_t size, size_t alignment) +{ + return _mm_malloc(size, alignment); +} + +void +Memory::Free(void* ptr) +{ + if (ptr) + _mm_free(ptr); +} + +////////////////////////////////////////////////////////////////////////// + +ChunkingLinearAllocator::ChunkingLinearAllocator(uint64_t ChunkSize, uint64_t ChunkAlignment) +: m_ChunkSize(ChunkSize) +, m_ChunkAlignment(ChunkAlignment) +{ +} + +ChunkingLinearAllocator::~ChunkingLinearAllocator() +{ + Reset(); +} + +void +ChunkingLinearAllocator::Reset() +{ + for (void* ChunkEntry : m_ChunkList) + { + Memory::Free(ChunkEntry); + } + m_ChunkList.clear(); + + m_ChunkCursor = nullptr; + m_ChunkBytesRemain = 0; +} + +void* +ChunkingLinearAllocator::Alloc(size_t Size, size_t Alignment) +{ + ZEN_ASSERT_SLOW(zen::IsPow2(Alignment)); + + // This could be 
improved in a bunch of ways + // + // * We pessimistically allocate memory even though there may be enough memory available for a single allocation due to the way we take + // alignment into account below + // * The block allocation size could be chosen to minimize slack for the case when multiple oversize allocations are made rather than + // minimizing the number of chunks + // * ... + + const uint64_t AllocationSize = zen::RoundUp(Size, Alignment); + + if (m_ChunkBytesRemain < (AllocationSize + Alignment - 1)) + { + const uint64_t ChunkSize = zen::RoundUp(zen::Max(m_ChunkSize, Size), m_ChunkSize); + void* ChunkPtr = Memory::Alloc(ChunkSize, m_ChunkAlignment); + m_ChunkCursor = reinterpret_cast<uint8_t*>(ChunkPtr); + m_ChunkBytesRemain = ChunkSize; + m_ChunkList.push_back(ChunkPtr); + } + + const uint64_t AlignFixup = (Alignment - reinterpret_cast<uintptr_t>(m_ChunkCursor)) & (Alignment - 1); + void* ReturnPtr = m_ChunkCursor + AlignFixup; + const uint64_t Delta = AlignFixup + AllocationSize; + + ZEN_ASSERT_SLOW(m_ChunkBytesRemain >= Delta); + + m_ChunkCursor += Delta; + m_ChunkBytesRemain -= Delta; + + ZEN_ASSERT_SLOW(IsPointerAligned(ReturnPtr, Alignment)); + + return ReturnPtr; +} + +////////////////////////////////////////////////////////////////////////// +// +// Unit tests +// + +TEST_CASE("ChunkingLinearAllocator") +{ + ChunkingLinearAllocator Allocator(4096); + + void* p1 = Allocator.Alloc(1, 1); + void* p2 = Allocator.Alloc(1, 1); + + CHECK(p1 != p2); + + void* p3 = Allocator.Alloc(1, 4); + CHECK(IsPointerAligned(p3, 4)); + + void* p3_2 = Allocator.Alloc(1, 4); + CHECK(IsPointerAligned(p3_2, 4)); + + void* p4 = Allocator.Alloc(1, 8); + CHECK(IsPointerAligned(p4, 8)); + + for (int i = 0; i < 100; ++i) + { + void* p0 = Allocator.Alloc(64); + ZEN_UNUSED(p0); + } +} + +TEST_CASE("MemoryView") +{ + { + uint8_t Array1[16]; + MemoryView View1 = MakeMemoryView(Array1); + CHECK(View1.GetSize() == 16); + } + + { + uint32_t Array2[16]; + MemoryView View2 = 
MakeMemoryView(Array2); + CHECK(View2.GetSize() == 64); + } + + CHECK(MakeMemoryView<float>({1.0f, 1.2f}).GetSize() == 8); +} + +void +memory_forcelink() +{ +} + +} // namespace zen diff --git a/zencore/refcount.cpp b/zencore/refcount.cpp new file mode 100644 index 000000000..943635552 --- /dev/null +++ b/zencore/refcount.cpp @@ -0,0 +1,96 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zencore/refcount.h> + +#include <doctest/doctest.h> +#include <functional> + +namespace zen { + +////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... +// + +struct TestRefClass : public RefCounted +{ + ~TestRefClass() + { + if (OnDestroy) + OnDestroy(); + } + + using RefCounted::RefCount; + + std::function<void()> OnDestroy; +}; + +void +refcount_forcelink() +{ +} + +TEST_CASE("RefPtr") +{ + RefPtr<TestRefClass> Ref; + Ref = new TestRefClass; + + bool IsDestroyed = false; + Ref->OnDestroy = [&] { IsDestroyed = true; }; + + CHECK(IsDestroyed == false); + CHECK(Ref->RefCount() == 1); + + RefPtr<TestRefClass> Ref2; + Ref2 = Ref; + + CHECK(IsDestroyed == false); + CHECK(Ref->RefCount() == 2); + + RefPtr<TestRefClass> Ref3; + Ref2 = Ref3; + + CHECK(IsDestroyed == false); + CHECK(Ref->RefCount() == 1); + Ref = Ref3; + + CHECK(IsDestroyed == true); +} + +TEST_CASE("RefPtr on Stack allocated object") +{ + bool IsDestroyed = false; + + { + TestRefClass StackRefClass; + + StackRefClass.OnDestroy = [&] { IsDestroyed = true; }; + + CHECK(StackRefClass.RefCount() == 1); // Stack allocated objects should have +1 ref + + RefPtr<TestRefClass> Ref{&StackRefClass}; + + CHECK(IsDestroyed == false); + CHECK(StackRefClass.RefCount() == 2); + + RefPtr<TestRefClass> Ref2; + Ref2 = Ref; + + CHECK(IsDestroyed == false); + CHECK(StackRefClass.RefCount() == 3); + + RefPtr<TestRefClass> Ref3; + Ref2 = Ref3; + + CHECK(IsDestroyed == false); + CHECK(StackRefClass.RefCount() == 2); + + Ref = Ref3; + CHECK(IsDestroyed == false); + 
CHECK(StackRefClass.RefCount() == 1); + } + + CHECK(IsDestroyed == true); +} + +} // namespace zen diff --git a/zencore/sha1.cpp b/zencore/sha1.cpp new file mode 100644 index 000000000..3cc2f5cdf --- /dev/null +++ b/zencore/sha1.cpp @@ -0,0 +1,439 @@ +// ////////////////////////////////////////////////////////// +// sha1.cpp +// Copyright (c) 2014,2015 Stephan Brumme. All rights reserved. +// see http://create.stephan-brumme.com/disclaimer.html +// + +#include <zencore/sha1.h> +#include <zencore/string.h> +#include <zencore/zencore.h> + +#include <doctest/doctest.h> +#include <string.h> + +// big endian architectures need #define __BYTE_ORDER __BIG_ENDIAN +#ifndef _MSC_VER +# include <endian.h> +#endif + +namespace zen { + +////////////////////////////////////////////////////////////////////////// + +SHA1 SHA1::Zero; // Initialized to all zeroes + +////////////////////////////////////////////////////////////////////////// + +SHA1Stream::SHA1Stream() +{ + Reset(); +} + +void +SHA1Stream::Reset() +{ + m_NumBytes = 0; + m_BufferSize = 0; + + // according to RFC 1321 + m_Hash[0] = 0x67452301; + m_Hash[1] = 0xefcdab89; + m_Hash[2] = 0x98badcfe; + m_Hash[3] = 0x10325476; + m_Hash[4] = 0xc3d2e1f0; +} + +namespace { + // mix functions for processBlock() + inline uint32_t f1(uint32_t b, uint32_t c, uint32_t d) + { + return d ^ (b & (c ^ d)); // original: f = (b & c) | ((~b) & d); + } + + inline uint32_t f2(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; } + + inline uint32_t f3(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (b & d) | (c & d); } + + inline uint32_t rotate(uint32_t a, uint32_t c) { return (a << c) | (a >> (32 - c)); } + + inline uint32_t swap(uint32_t x) + { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap32(x); +#endif +#ifdef MSC_VER + return _byteswap_ulong(x); +#endif + + return (x >> 24) | ((x >> 8) & 0x0000FF00) | ((x << 8) & 0x00FF0000) | (x << 24); + } +} // namespace + +/// process 64 bytes +void 
+SHA1Stream::ProcessBlock(const void* data) +{ + // get last hash + uint32_t a = m_Hash[0]; + uint32_t b = m_Hash[1]; + uint32_t c = m_Hash[2]; + uint32_t d = m_Hash[3]; + uint32_t e = m_Hash[4]; + + // data represented as 16x 32-bit words + const uint32_t* input = (uint32_t*)data; + // convert to big endian + uint32_t words[80]; + for (int i = 0; i < 16; i++) +#if defined(__BYTE_ORDER) && (__BYTE_ORDER != 0) && (__BYTE_ORDER == __BIG_ENDIAN) + words[i] = input[i]; +#else + words[i] = swap(input[i]); +#endif + + // extend to 80 words + for (int i = 16; i < 80; i++) + words[i] = rotate(words[i - 3] ^ words[i - 8] ^ words[i - 14] ^ words[i - 16], 1); + + // first round + for (int i = 0; i < 4; i++) + { + int offset = 5 * i; + e += rotate(a, 5) + f1(b, c, d) + words[offset] + 0x5a827999; + b = rotate(b, 30); + d += rotate(e, 5) + f1(a, b, c) + words[offset + 1] + 0x5a827999; + a = rotate(a, 30); + c += rotate(d, 5) + f1(e, a, b) + words[offset + 2] + 0x5a827999; + e = rotate(e, 30); + b += rotate(c, 5) + f1(d, e, a) + words[offset + 3] + 0x5a827999; + d = rotate(d, 30); + a += rotate(b, 5) + f1(c, d, e) + words[offset + 4] + 0x5a827999; + c = rotate(c, 30); + } + + // second round + for (int i = 4; i < 8; i++) + { + int offset = 5 * i; + e += rotate(a, 5) + f2(b, c, d) + words[offset] + 0x6ed9eba1; + b = rotate(b, 30); + d += rotate(e, 5) + f2(a, b, c) + words[offset + 1] + 0x6ed9eba1; + a = rotate(a, 30); + c += rotate(d, 5) + f2(e, a, b) + words[offset + 2] + 0x6ed9eba1; + e = rotate(e, 30); + b += rotate(c, 5) + f2(d, e, a) + words[offset + 3] + 0x6ed9eba1; + d = rotate(d, 30); + a += rotate(b, 5) + f2(c, d, e) + words[offset + 4] + 0x6ed9eba1; + c = rotate(c, 30); + } + + // third round + for (int i = 8; i < 12; i++) + { + int offset = 5 * i; + e += rotate(a, 5) + f3(b, c, d) + words[offset] + 0x8f1bbcdc; + b = rotate(b, 30); + d += rotate(e, 5) + f3(a, b, c) + words[offset + 1] + 0x8f1bbcdc; + a = rotate(a, 30); + c += rotate(d, 5) + f3(e, a, b) + words[offset + 
2] + 0x8f1bbcdc; + e = rotate(e, 30); + b += rotate(c, 5) + f3(d, e, a) + words[offset + 3] + 0x8f1bbcdc; + d = rotate(d, 30); + a += rotate(b, 5) + f3(c, d, e) + words[offset + 4] + 0x8f1bbcdc; + c = rotate(c, 30); + } + + // fourth round + for (int i = 12; i < 16; i++) + { + int offset = 5 * i; + e += rotate(a, 5) + f2(b, c, d) + words[offset] + 0xca62c1d6; + b = rotate(b, 30); + d += rotate(e, 5) + f2(a, b, c) + words[offset + 1] + 0xca62c1d6; + a = rotate(a, 30); + c += rotate(d, 5) + f2(e, a, b) + words[offset + 2] + 0xca62c1d6; + e = rotate(e, 30); + b += rotate(c, 5) + f2(d, e, a) + words[offset + 3] + 0xca62c1d6; + d = rotate(d, 30); + a += rotate(b, 5) + f2(c, d, e) + words[offset + 4] + 0xca62c1d6; + c = rotate(c, 30); + } + + // update hash + m_Hash[0] += a; + m_Hash[1] += b; + m_Hash[2] += c; + m_Hash[3] += d; + m_Hash[4] += e; +} + +/// add arbitrary number of bytes +SHA1Stream& +SHA1Stream::Append(const void* data, size_t byteCount) +{ + const uint8_t* current = (const uint8_t*)data; + + if (m_BufferSize > 0) + { + while (byteCount > 0 && m_BufferSize < BlockSize) + { + m_Buffer[m_BufferSize++] = *current++; + byteCount--; + } + } + + // full buffer + if (m_BufferSize == BlockSize) + { + ProcessBlock((void*)m_Buffer); + m_NumBytes += BlockSize; + m_BufferSize = 0; + } + + // no more data ? 
+ if (byteCount == 0) + return *this; + + // process full blocks + while (byteCount >= BlockSize) + { + ProcessBlock(current); + current += BlockSize; + m_NumBytes += BlockSize; + byteCount -= BlockSize; + } + + // keep remaining bytes in buffer + while (byteCount > 0) + { + m_Buffer[m_BufferSize++] = *current++; + byteCount--; + } + + return *this; +} + +/// process final block, less than 64 bytes +void +SHA1Stream::ProcessBuffer() +{ + // the input bytes are considered as bits strings, where the first bit is the most significant bit of the byte + + // - append "1" bit to message + // - append "0" bits until message length in bit mod 512 is 448 + // - append length as 64 bit integer + + // number of bits + size_t paddedLength = m_BufferSize * 8; + + // plus one bit set to 1 (always appended) + paddedLength++; + + // number of bits must be (numBits % 512) = 448 + size_t lower11Bits = paddedLength & 511; + if (lower11Bits <= 448) + paddedLength += 448 - lower11Bits; + else + paddedLength += 512 + 448 - lower11Bits; + // convert from bits to bytes + paddedLength /= 8; + + // only needed if additional data flows over into a second block + unsigned char extra[BlockSize]; + + // append a "1" bit, 128 => binary 10000000 + if (m_BufferSize < BlockSize) + m_Buffer[m_BufferSize] = 128; + else + extra[0] = 128; + + size_t i; + for (i = m_BufferSize + 1; i < BlockSize; i++) + m_Buffer[i] = 0; + for (; i < paddedLength; i++) + extra[i - BlockSize] = 0; + + // add message length in bits as 64 bit number + uint64_t msgBits = 8 * (m_NumBytes + m_BufferSize); + // find right position + unsigned char* addLength; + if (paddedLength < BlockSize) + addLength = m_Buffer + paddedLength; + else + addLength = extra + paddedLength - BlockSize; + + // must be big endian + *addLength++ = (unsigned char)((msgBits >> 56) & 0xFF); + *addLength++ = (unsigned char)((msgBits >> 48) & 0xFF); + *addLength++ = (unsigned char)((msgBits >> 40) & 0xFF); + *addLength++ = (unsigned char)((msgBits >> 32) & 
0xFF); + *addLength++ = (unsigned char)((msgBits >> 24) & 0xFF); + *addLength++ = (unsigned char)((msgBits >> 16) & 0xFF); + *addLength++ = (unsigned char)((msgBits >> 8) & 0xFF); + *addLength = (unsigned char)(msgBits & 0xFF); + + // process blocks + ProcessBlock(m_Buffer); + // flowed over into a second block ? + if (paddedLength > BlockSize) + ProcessBlock(extra); +} + +/// return latest hash as bytes +SHA1 +SHA1Stream::GetHash() +{ + SHA1 sha1; + // save old hash if buffer is partially filled + uint32_t oldHash[HashValues]; + for (int i = 0; i < HashValues; i++) + oldHash[i] = m_Hash[i]; + + // process remaining bytes + ProcessBuffer(); + + unsigned char* current = sha1.Hash; + for (int i = 0; i < HashValues; i++) + { + *current++ = (m_Hash[i] >> 24) & 0xFF; + *current++ = (m_Hash[i] >> 16) & 0xFF; + *current++ = (m_Hash[i] >> 8) & 0xFF; + *current++ = m_Hash[i] & 0xFF; + + // restore old hash + m_Hash[i] = oldHash[i]; + } + + return sha1; +} + +/// compute SHA1 of a memory block +SHA1 +SHA1Stream::Compute(const void* data, size_t byteCount) +{ + Reset(); + Append(data, byteCount); + return GetHash(); +} + +SHA1 +SHA1::HashMemory(const void* data, size_t byteCount) +{ + return SHA1Stream().Append(data, byteCount).GetHash(); +} + +SHA1 +SHA1::FromHexString(const char* string) +{ + SHA1 sha1; + + ParseHexBytes(string, 40, sha1.Hash); + + return sha1; +} + +const char* +SHA1::ToHexString(char* outString /* 40 characters + NUL terminator */) const +{ + ToHexBytes(Hash, sizeof(SHA1), outString); + outString[2 * sizeof(SHA1)] = '\0'; + + return outString; +} + +StringBuilderBase& +SHA1::ToHexString(StringBuilderBase& outBuilder) const +{ + char str[41]; + ToHexString(str); + + outBuilder.AppendRange(str, &str[40]); + + return outBuilder; +} + +////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... 
+// + +void +sha1_forcelink() +{ +} + +doctest::String +toString(const SHA1& value) +{ + char sha1text[2 * sizeof(SHA1) + 1]; + value.ToHexString(sha1text); + + return sha1text; +} + +TEST_CASE("SHA1") +{ + uint8_t sha1_empty[20] = {0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, 0x32, 0x55, + 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, 0xaf, 0xd8, 0x07, 0x09}; + SHA1 sha1z; + memcpy(sha1z.Hash, sha1_empty, sizeof sha1z.Hash); + + SUBCASE("Empty string") + { + SHA1 sha1 = SHA1::HashMemory(nullptr, 0); + + CHECK(sha1 == sha1z); + } + + SUBCASE("Empty stream") + { + SHA1Stream sha1s; + sha1s.Append(nullptr, 0); + sha1s.Append(nullptr, 0); + sha1s.Append(nullptr, 0); + CHECK(sha1s.GetHash() == sha1z); + } + + SUBCASE("SHA1 from string") + { + const SHA1 sha1empty = SHA1::FromHexString("da39a3ee5e6b4b0d3255bfef95601890afd80709"); + + CHECK(sha1z == sha1empty); + } + + SUBCASE("SHA1 to string") + { + char sha1str[41]; + sha1z.ToHexString(sha1str); + + CHECK(StringEquals(sha1str, "da39a3ee5e6b4b0d3255bfef95601890afd80709")); + } + + SUBCASE("Hash ABC") + { + const SHA1 sha1abc = SHA1::FromHexString("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8"); + + SHA1Stream sha1s; + + sha1s.Append("A", 1); + sha1s.Append("B", 1); + sha1s.Append("C", 1); + CHECK(sha1s.GetHash() == sha1abc); + + sha1s.Reset(); + sha1s.Append("AB", 2); + sha1s.Append("C", 1); + CHECK(sha1s.GetHash() == sha1abc); + + sha1s.Reset(); + sha1s.Append("ABC", 3); + CHECK(sha1s.GetHash() == sha1abc); + + sha1s.Reset(); + sha1s.Append("A", 1); + sha1s.Append("BC", 2); + CHECK(sha1s.GetHash() == sha1abc); + } +} + +} // namespace zen diff --git a/zencore/sharedbuffer.cpp b/zencore/sharedbuffer.cpp new file mode 100644 index 000000000..bc991053d --- /dev/null +++ b/zencore/sharedbuffer.cpp @@ -0,0 +1,110 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <zencore/sharedbuffer.h> + +#include <doctest/doctest.h> +#include <memory.h> + +#include <atlfile.h> +#include <gsl/gsl-lite.hpp> + +namespace zen { + +BufferOwner::~BufferOwner() +{ + if (m_IsOwned) + { + Memory::Free(m_Data); + } +} + +////////////////////////////////////////////////////////////////////////// + +UniqueBuffer +UniqueBuffer::Alloc(uint64_t Size) +{ + void* Buffer = Memory::Alloc(Size, 16); + BufferOwner* Owner = new BufferOwner(Buffer, Size, /* owned */ true); + + return UniqueBuffer(Owner); +} + +UniqueBuffer +UniqueBuffer::MakeView(void* DataPtr, uint64_t Size) +{ + return UniqueBuffer(new BufferOwner(DataPtr, Size, /* owned */ false)); +} + +UniqueBuffer::UniqueBuffer(BufferOwner* Owner) : m_buffer(Owner) +{ +} + +////////////////////////////////////////////////////////////////////////// + +SharedBuffer::SharedBuffer(UniqueBuffer&& InBuffer) : m_buffer(std::move(InBuffer.m_buffer)) +{ +} + +void +SharedBuffer::MakeOwned() +{ + if (IsOwned() || !m_buffer) + return; + + const uint64_t Size = m_buffer->m_Size; + void* Buffer = Memory::Alloc(Size, 16); + auto NewOwner = new BufferOwner(Buffer, Size, /* owned */ true); + + memcpy(Buffer, m_buffer->m_Data, Size); + + m_buffer = NewOwner; +} + +SharedBuffer +SharedBuffer::MakeView(MemoryView View, SharedBuffer Buffer) +{ + // Todo: verify that view is within the shared buffer + + return SharedBuffer(new BufferOwner(const_cast<void*>(View.GetData()), View.GetSize(), /* owned */ false, Buffer.m_buffer)); +} + +SharedBuffer +SharedBuffer::MakeView(const void* Data, uint64_t Size) +{ + return SharedBuffer(new BufferOwner(const_cast<void*>(Data), Size, /* owned */ false)); +} + +SharedBuffer +SharedBuffer::Clone() +{ + const uint64_t Size = GetSize(); + void* Buffer = Memory::Alloc(Size, 16); + auto NewOwner = new BufferOwner(Buffer, Size, /* owned */ true); + memcpy(Buffer, m_buffer->m_Data, Size); + + return SharedBuffer(NewOwner); +} + +SharedBuffer +SharedBuffer::Clone(MemoryView View) +{ + 
const uint64_t Size = View.GetSize(); + void* Buffer = Memory::Alloc(Size, 16); + auto NewOwner = new BufferOwner(Buffer, Size, /* owned */ true); + memcpy(Buffer, View.GetData(), Size); + + return SharedBuffer(NewOwner); +} + +////////////////////////////////////////////////////////////////////////// + +void +sharedbuffer_forcelink() +{ +} + +TEST_CASE("SharedBuffer") +{ +} + +} // namespace zen diff --git a/zencore/snapshot_manifest.cpp b/zencore/snapshot_manifest.cpp new file mode 100644 index 000000000..7d0769d13 --- /dev/null +++ b/zencore/snapshot_manifest.cpp @@ -0,0 +1,281 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <doctest/doctest.h> +#include <zencore/snapshot_manifest.h> +#include <zencore/stream.h> +#include <zencore/streamutil.h> +#include <zencore/string.h> +#include <ostream> + +#include <filesystem> + +#include <atlbase.h> + +// Used for getting My Documents for default snapshot dir +#include <ShlObj.h> +#pragma comment(lib, "shell32.lib") + +namespace zen { + +constexpr const char* magicString = "-=- ZEN_SNAP -=-"; + +struct SerializedManifestHeader +{ + char Magic[16]; + + void init() { memcpy(Magic, magicString, sizeof Magic); } + bool verify() const { return memcmp(Magic, magicString, sizeof Magic) == 0; } +}; + +TextWriter& +operator<<(TextWriter& Writer, const LeafNode& Leaf) +{ + Writer << "modTime: " << Leaf.FileModifiedTime << ", size: " << Leaf.FileSize << ", hash: " << Leaf.ChunkHash << ", name: " << Leaf.Name + << "\n"; + + return Writer; +} + +BinaryWriter& +operator<<(BinaryWriter& Writer, const LeafNode& Leaf) +{ + Writer << Leaf.FileModifiedTime << Leaf.FileSize << Leaf.ChunkHash << Leaf.Name; + + return Writer; +} + +BinaryReader& +operator>>(BinaryReader& Reader, LeafNode& Leaf) +{ + Reader >> Leaf.FileModifiedTime >> Leaf.FileSize >> Leaf.ChunkHash >> Leaf.Name; + + return Reader; +} + +void +TreeNode::Finalize() +{ + zen::BLAKE3Stream Blake3Stream; + + for (auto& Node : Children) + { + Node.Finalize(); + 
Blake3Stream.Append(Node.ChunkHash.Hash, sizeof Node.ChunkHash); + Blake3Stream.Append(Node.Name.data(), Node.Name.size() + 1); + } + + for (auto& leaf : Leaves) + { + Blake3Stream.Append(leaf.ChunkHash.Hash, sizeof leaf.ChunkHash); + Blake3Stream.Append(leaf.Name.data(), leaf.Name.size() + 1); + } + + this->ChunkHash = Blake3Stream.GetHash(); +} + +void +TreeNode::VisitFiles(std::function<void(const LeafNode& node)> func) +{ + for (auto& Node : Children) + Node.VisitFiles(func); + + for (auto& Leaf : Leaves) + func(Leaf); +} + +void +TreeNode::VisitModifyFiles(std::function<void(LeafNode& node)> func) +{ + for (auto& Node : Children) + Node.VisitModifyFiles(func); + + for (auto& Leaf : Leaves) + func(Leaf); +} + +IndentTextWriter& +operator<<(IndentTextWriter& Writer, const TreeNode& Node) +{ + Writer << "hash: " << Node.ChunkHash << ", name: " << Node.Name << "\n"; + + if (!Node.Leaves.empty()) + { + Writer << "files: " + << "\n"; + + IndentTextWriter::Scope _(Writer); + + for (const LeafNode& Leaf : Node.Leaves) + Writer << Leaf; + } + + if (!Node.Children.empty()) + { + Writer << "children: " + << "\n"; + + IndentTextWriter::Scope _(Writer); + + for (const TreeNode& Child : Node.Children) + { + Writer << Child; + } + } + + return Writer; +} + +BinaryWriter& +operator<<(BinaryWriter& Writer, const TreeNode& Node) +{ + Writer << Node.ChunkHash << Node.Name; + Writer << uint32_t(Node.Children.size()); + + for (const TreeNode& child : Node.Children) + Writer << child; + + Writer << uint32_t(Node.Leaves.size()); + + for (const LeafNode& Leaf : Node.Leaves) + Writer << Leaf; + + return Writer; +} + +BinaryReader& +operator>>(BinaryReader& Reader, TreeNode& Node) +{ + Reader >> Node.ChunkHash >> Node.Name; + + uint32_t ChildCount = 0; + Reader >> ChildCount; + Node.Children.resize(ChildCount); + + for (TreeNode& Child : Node.Children) + Reader >> Child; + + uint32_t LeafCount = 0; + Reader >> LeafCount; + Node.Leaves.resize(LeafCount); + + for (LeafNode& Leaf : 
Node.Leaves) + Reader >> Leaf; + + return Reader; +} + +void +SnapshotManifest::finalize() +{ + Root.Finalize(); + + zen::BLAKE3Stream Blake3Stream; + + Blake3Stream.Append(Root.ChunkHash.Hash, sizeof Root.ChunkHash); + Blake3Stream.Append(Root.Name.data(), Root.Name.size() + 1); + + this->ChunkHash = Blake3Stream.GetHash(); +} + +void +WriteManifest(const SnapshotManifest& Manifest, OutStream& ToStream) +{ + BinaryWriter Out(ToStream); + SerializedManifestHeader Header; + Header.init(); + Out.Write(&Header, sizeof Header); + + Out << Manifest.ChunkHash << Manifest.Id << Manifest.Root; +} + +void +ReadManifest(SnapshotManifest& Manifest, InStream& FromStream) +{ + BinaryReader Reader(FromStream); + SerializedManifestHeader Header; + Reader.Read(&Header, sizeof Header); + + Reader >> Manifest.ChunkHash >> Manifest.Id >> Manifest.Root; +} + +void +PrintManifest(const SnapshotManifest& Manifest, OutStream& ToStream) +{ + IndentTextWriter Writer(ToStream); + + Writer << "hash: " << Manifest.ChunkHash << "\n"; + Writer << "id: " << Manifest.Id << "\n"; + Writer << "root: " + << "\n"; + IndentTextWriter::Scope _(Writer); + Writer << Manifest.Root; +} + +std::filesystem::path +ManifestSpecToPath(const char* ManifestSpec) +{ + ExtendableWideStringBuilder<128> ManifestTargetFile; + + if (ManifestSpec[0] == '#') + { + // Pick sensible default + + WCHAR MyDocumentsDir[MAX_PATH]; + HRESULT hRes = SHGetFolderPathW(NULL, + CSIDL_PERSONAL /* My Documents */, + NULL, + SHGFP_TYPE_CURRENT, + /* out */ MyDocumentsDir); + + if (SUCCEEDED(hRes)) + { + wcscat_s(MyDocumentsDir, L"\\zenfs\\Snapshots\\"); + + ManifestTargetFile.Append(MyDocumentsDir); + ManifestTargetFile.AppendAscii(ManifestSpec + 1); + } + } + else + { + ManifestTargetFile.AppendAscii(ManifestSpec); + } + + std::filesystem::path ManifestPath{ManifestTargetFile.c_str()}; + + if (ManifestPath.extension() != L".zenfs") + { + ManifestPath.append(L".zenfs"); + } + + return ManifestPath; +} + 
+////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... +// + +void +snapshotmanifest_forcelink() +{ +} + +TEST_CASE("Snapshot manifest") +{ + SnapshotManifest Manifest; + + Manifest.Id = "test_manifest"; + Manifest.ChunkHash = zen::BLAKE3::HashMemory("abcd", 4); + + MemoryOutStream Outstream; + WriteManifest(Manifest, Outstream); + + MemoryInStream Instream(Outstream.Data(), Outstream.Size()); + SnapshotManifest Manifest2; + ReadManifest(/* out */ Manifest2, Instream); + + CHECK(Manifest.Id == Manifest2.Id); + CHECK(Manifest.ChunkHash == Manifest2.ChunkHash); +} + +} // namespace zen diff --git a/zencore/stats.cpp b/zencore/stats.cpp new file mode 100644 index 000000000..f8cdc8fbb --- /dev/null +++ b/zencore/stats.cpp @@ -0,0 +1,73 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "zencore/stats.h" +#include <doctest/doctest.h> +#include <cmath> +#include "zencore/timer.h" + +// +// Derived from https://github.com/dln/medida/blob/master/src/medida/stats/ewma.cc +// + +namespace zen { + +static constexpr int kTickInterval = 5; // In seconds +static constexpr double kSecondsPerMinute = 60.0; +static constexpr int kOneMinute = 1; +static constexpr int kFiveMinutes = 5; +static constexpr int kFifteenMinutes = 15; + +static double kM1_ALPHA = 1.0 - std::exp(-kTickInterval / kSecondsPerMinute / kOneMinute); +static double kM5_ALPHA = 1.0 - std::exp(-kTickInterval / kSecondsPerMinute / kFiveMinutes); +static double kM15_ALPHA = 1.0 - std::exp(-kTickInterval / kSecondsPerMinute / kFifteenMinutes); + +static uint64_t CountPerTick = GetHifreqTimerFrequencySafe() * kTickInterval; +static uint64_t CountPerSecond = GetHifreqTimerFrequencySafe(); + +void +EWMA::Tick(double Alpha, uint64_t Interval, uint64_t Count, bool IsInitialUpdate) +{ + double InstantRate = double(Count) / Interval; + + if (IsInitialUpdate) + { + m_rate = InstantRate; + } + else + { + m_rate += Alpha * (InstantRate - m_rate); + } 
+} + +double +EWMA::Rate() const +{ + return m_rate * CountPerSecond; +} + +////////////////////////////////////////////////////////////////////////// + +TEST_CASE("Stats") +{ + SUBCASE("Simple") + { + EWMA ewma1; + ewma1.Tick(kM1_ALPHA, CountPerSecond, 5, true); + + CHECK(ewma1.Rate() - 5 < 0.001); + + for (int i = 0; i < 60; ++i) + ewma1.Tick(kM1_ALPHA, CountPerSecond, 10, false); + + CHECK(ewma1.Rate() - 10 < 0.001); + + ewma1.Tick(kM1_ALPHA, CountPerSecond, 10, false); + } +} + +void +stats_forcelink() +{ +} + +} // namespace zen diff --git a/zencore/stream.cpp b/zencore/stream.cpp new file mode 100644 index 000000000..2cda0e123 --- /dev/null +++ b/zencore/stream.cpp @@ -0,0 +1,307 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <doctest/doctest.h> +#include <stdarg.h> +#include <zencore/memory.h> +#include <zencore/stream.h> +#include <algorithm> +#include <exception> + +namespace zen { + +MemoryInStream::MemoryInStream(const void* buffer, size_t size) +: m_Buffer(reinterpret_cast<const uint8_t*>(buffer), reinterpret_cast<const uint8_t*>(buffer) + size) +{ +} + +void +MemoryInStream::Read(void* buffer, size_t byteCount, uint64_t offset) +{ + RwLock::ExclusiveLockScope _(m_Lock); + + const size_t needEnd = offset + byteCount; + + if (needEnd > m_Buffer.size()) + throw std::exception("read past end of file!"); // TODO: better exception + + memcpy(buffer, m_Buffer.data() + offset, byteCount); +} + +void +MemoryOutStream::Write(const void* data, size_t byteCount, uint64_t offset) +{ + RwLock::ExclusiveLockScope _(m_Lock); + + const size_t needEnd = offset + byteCount; + + if (needEnd > m_Buffer.size()) + m_Buffer.resize(needEnd); + + memcpy(m_Buffer.data() + offset, data, byteCount); +} + +void +MemoryOutStream::Flush() +{ + // No-op +} + +////////////////////////////////////////////////////////////////////////// + +TextWriter::TextWriter(OutStream& stream) : m_Stream(&stream) +{ +} + +TextWriter::~TextWriter() = default; + +void 
+TextWriter::Write(const void* data, size_t byteCount) +{ + m_Stream->Write(data, byteCount, m_CurrentOffset); + m_CurrentOffset += byteCount; +} + +TextWriter& +operator<<(TextWriter& Writer, const char* value) +{ + if (value) + Writer.Write(value, strlen(value)); + else + Writer.Write("(null)", 6); + + return Writer; +} + +TextWriter& +operator<<(TextWriter& writer, const std::string_view& value) +{ + writer.Write(value.data(), value.size()); + + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, bool value) +{ + if (value) + writer.Write("true", 4); + else + writer.Write("false", 5); + + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, int8_t value) +{ + char buffer[16]; + _itoa_s(value, buffer, 10); + writer << buffer; + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, int16_t value) +{ + char buffer[16]; + _itoa_s(value, buffer, 10); + writer << buffer; + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, int32_t value) +{ + char buffer[16]; + _itoa_s(value, buffer, 10); + writer << buffer; + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, int64_t value) +{ + char buffer[32]; + _i64toa_s(value, buffer, sizeof buffer, 10); + writer << buffer; + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, uint8_t value) +{ + char buffer[16]; + _ultoa_s(value, buffer, 10); + writer << buffer; + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, uint16_t value) +{ + char buffer[16]; + _ultoa_s(value, buffer, 10); + writer << buffer; + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, uint32_t value) +{ + char buffer[16]; + _ultoa_s(value, buffer, 10); + writer << buffer; + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, uint64_t value) +{ + char buffer[32]; + _ui64toa_s(value, buffer, sizeof buffer, 10); + writer << buffer; + return writer; +} + +void +TextWriter::Writef(const char* formatString, ...) 
+{ + va_list args; + va_start(args, formatString); + + char* tempBuffer = nullptr; + char buffer[4096]; + int rv = vsnprintf(buffer, sizeof buffer, formatString, args); + + ZEN_ASSERT(rv >= 0); + + if (rv > sizeof buffer) + { + // Need more room -- allocate temporary buffer + + tempBuffer = (char*)Memory::Alloc(rv + 1, 8); + + int rv2 = vsnprintf(tempBuffer, rv + 1, formatString, args); + + ZEN_ASSERT(rv >= 0); + ZEN_ASSERT(rv2 <= rv); + + rv = rv2; + } + + m_Stream->Write(tempBuffer ? tempBuffer : buffer, rv, m_CurrentOffset); + m_CurrentOffset += rv; + + if (tempBuffer) + Memory::Free(tempBuffer); + + va_end(args); +} + +////////////////////////////////////////////////////////////////////////// + +IndentTextWriter::IndentTextWriter(OutStream& stream) : TextWriter(stream) +{ +} + +IndentTextWriter::~IndentTextWriter() +{ +} + +void +IndentTextWriter::Write(const void* data, size_t byteCount) +{ + const uint8_t* src = reinterpret_cast<const uint8_t*>(data); + int cur = m_LineCursor; + + while (byteCount) + { + char c = *src++; + + if (cur == 0) + { + const char indentSpaces[] = + " " + " "; + + cur = std::min<int>(m_IndentAmount, sizeof indentSpaces - 1); + memcpy(m_LineBuffer, indentSpaces, cur); + } + + m_LineBuffer[cur++] = c; + --byteCount; + + if (c == '\n' || cur == sizeof m_LineBuffer) + { + TextWriter::Write(m_LineBuffer, cur); + + cur = 0; + } + } + + m_LineCursor = cur; +} + +////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... 
+// + +void +stream_forcelink() +{ +} + +TEST_CASE("BinaryWriter and BinaryWriter") +{ + MemoryOutStream stream; + BinaryWriter writer(stream); + + CHECK(writer.CurrentOffset() == 0); + + writer.Write("foo!", 4); + CHECK(writer.CurrentOffset() == 4); + + writer << uint8_t(42) << uint16_t(42) << uint32_t(42) << uint64_t(42); + writer << int8_t(42) << int16_t(42) << int32_t(42) << int64_t(42); + + CHECK(writer.CurrentOffset() == (4 + 15 * 2)); + + // Read the data back + + MemoryInStream instream(stream.Data(), stream.Size()); + BinaryReader reader(instream); + CHECK(reader.CurrentOffset() == 0); + + char buffer[4]; + reader.Read(buffer, 4); + CHECK(reader.CurrentOffset() == 4); + + CHECK(memcmp(buffer, "foo!", 4) == 0); + + uint8_t ui8 = 0; + uint16_t ui16 = 0; + uint32_t ui32 = 0; + uint64_t ui64 = 0; + int8_t i8 = 0; + int16_t i16 = 0; + int32_t i32 = 0; + int64_t i64 = 0; + + reader >> ui8 >> ui16 >> ui32 >> ui64; + reader >> i8 >> i16 >> i32 >> i64; + + CHECK(reader.CurrentOffset() == (4 + 15 * 2)); + + CHECK(ui8 == 42); + CHECK(ui16 == 42); + CHECK(ui32 == 42); + CHECK(ui64 == 42); + + CHECK(i8 == 42); + CHECK(i16 == 42); + CHECK(i32 == 42); + CHECK(i64 == 42); +} + +} // namespace zen diff --git a/zencore/streamutil.cpp b/zencore/streamutil.cpp new file mode 100644 index 000000000..d3ed5ceaa --- /dev/null +++ b/zencore/streamutil.cpp @@ -0,0 +1,104 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <zencore/streamutil.h> +#include <zencore/string.h> + +namespace zen { + +BinaryWriter& +operator<<(BinaryWriter& writer, const std::string_view& value) +{ + writer.Write(value.data(), value.size()); + writer << uint8_t(0); + + return writer; +} + +BinaryReader& +operator>>(BinaryReader& reader, std::string& value) +{ + for (;;) + { + uint8_t x; + reader.Read(&x, 1); + + if (x == 0) + return reader; + + value.push_back(char(x)); + } +} + +BinaryWriter& +operator<<(BinaryWriter& writer, const std::wstring_view& value) +{ + // write as utf8 + + ExtendableStringBuilder<128> utf8; + WideToUtf8(value, utf8); + + writer.Write(utf8.c_str(), utf8.Size() + 1); + + return writer; +} + +BinaryReader& +operator>>(BinaryReader& reader, std::wstring& value) +{ + // read as utf8 + + std::string v8; + reader >> v8; + + ExtendableWideStringBuilder<128> wstr; + Utf8ToWide(v8, wstr); + + value = wstr.c_str(); + + return reader; +} + +TextWriter& +operator<<(TextWriter& writer, const zen::SHA1& value) +{ + zen::SHA1::String_t buffer; + value.ToHexString(buffer); + + writer.Write(buffer, zen::SHA1::StringLength); + + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, const zen::BLAKE3& value) +{ + zen::BLAKE3::String_t buffer; + value.ToHexString(buffer); + + writer.Write(buffer, zen::BLAKE3::StringLength); + + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, const zen::IoHash& value) +{ + zen::IoHash::String_t buffer; + value.ToHexString(buffer); + + writer.Write(buffer, zen::IoHash::StringLength); + + return writer; +} + +TextWriter& +operator<<(TextWriter& writer, const std::wstring_view& value) +{ + ExtendableStringBuilder<128> v8; + WideToUtf8(value, v8); + + writer.Write(v8.c_str(), v8.Size()); + return writer; +} + +} // namespace zen diff --git a/zencore/string.cpp b/zencore/string.cpp new file mode 100644 index 000000000..b6093ac2e --- /dev/null +++ b/zencore/string.cpp @@ -0,0 +1,913 @@ +// Copyright Epic Games, Inc. 
All Rights Reserved. + +#include <doctest/doctest.h> +#include <inttypes.h> +#include <stdio.h> +#include <zencore/memory.h> +#include <zencore/string.h> +#include <exception> +#include <ostream> + +#include <utf8.h> + +template<typename u16bit_iterator> +void +utf16to8_impl(u16bit_iterator StartIt, u16bit_iterator EndIt, ::zen::StringBuilderBase& OutString) +{ + while (StartIt != EndIt) + { + uint32_t cp = utf8::internal::mask16(*StartIt++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) + { + uint32_t trail_surrogate = utf8::internal::mask16(*StartIt++); + cp = (cp << 10) + trail_surrogate + utf8::internal::SURROGATE_OFFSET; + } + OutString.AppendCodepoint(cp); + } +} + +////////////////////////////////////////////////////////////////////////// + +namespace zen { + +bool +ToString(std::span<char> Buffer, uint64_t Num) +{ + snprintf(Buffer.data(), Buffer.size(), "%I64u", Num); + + return true; +} +bool +ToString(std::span<char> Buffer, int64_t Num) +{ + snprintf(Buffer.data(), Buffer.size(), "%I64d", Num); + + return true; +} + +////////////////////////////////////////////////////////////////////////// + +const char* +FilepathFindExtension(const std::string_view& Path, const char* ExtensionToMatch) +{ + const size_t PathLen = Path.size(); + + if (ExtensionToMatch) + { + size_t ExtLen = strlen(ExtensionToMatch); + + if (ExtLen > PathLen) + return nullptr; + + const char* PathExtension = Path.data() + PathLen - ExtLen; + + if (StringEquals(PathExtension, ExtensionToMatch)) + return PathExtension; + + return nullptr; + } + + if (PathLen == 0) + return nullptr; + + // Look for extension introducer ('.') + + for (size_t i = PathLen - 1; i >= 0; --i) + { + if (Path[i] == '.') + return Path.data() + i; + } + + return nullptr; +} + +////////////////////////////////////////////////////////////////////////// + +void +Utf8ToWide(const char8_t* Str8, WideStringBuilderBase& OutString) +{ + Utf8ToWide(std::u8string_view(Str8), OutString); 
+} + +void +Utf8ToWide(const std::string_view& Str8, WideStringBuilderBase& OutString) +{ + Utf8ToWide(std::u8string_view{reinterpret_cast<const char8_t*>(Str8.data()), Str8.size()}, OutString); +} + +std::wstring +Utf8ToWide(const std::string_view& Wstr) +{ + ExtendableWideStringBuilder<128> String; + Utf8ToWide(Wstr, String); + + return String.c_str(); +} + +void +Utf8ToWide(const std::u8string_view& Str8, WideStringBuilderBase& OutString) +{ + const char* str = (const char*)Str8.data(); + const size_t strLen = Str8.size(); + + const char* endStr = str + strLen; + size_t ByteCount = 0; + size_t CurrentOutChar = 0; + + for (; str != endStr; ++str) + { + unsigned char Data = static_cast<unsigned char>(*str); + + if (!(Data & 0x80)) + { + // ASCII + OutString.Append(wchar_t(Data)); + continue; + } + else if (!ByteCount) + { + // Start of multi-byte sequence. Figure out how + // many bytes we're going to consume + + size_t Count = 0; + + for (size_t Temp = Data; Temp & 0x80; Temp <<= 1) + ++Count; + + ByteCount = Count - 1; + CurrentOutChar = Data & (0xff >> (Count + 1)); + } + else + { + --ByteCount; + + if ((Data & 0xc0) != 0x80) + { + break; + } + + CurrentOutChar = (CurrentOutChar << 6) | (Data & 0x3f); + + if (!ByteCount) + { + OutString.Append(wchar_t(CurrentOutChar)); + CurrentOutChar = 0; + } + } + } +} + +void +WideToUtf8(const wchar_t* Wstr, StringBuilderBase& OutString) +{ + WideToUtf8(std::u16string_view{(char16_t*)Wstr}, OutString); +} + +void +WideToUtf8(const std::wstring_view& Wstr, StringBuilderBase& OutString) +{ + WideToUtf8(std::u16string_view{(char16_t*)Wstr.data(), Wstr.size()}, OutString); +} + +void +WideToUtf8(const std::u16string_view& Wstr, StringBuilderBase& OutString) +{ + utf16to8_impl(begin(Wstr), end(Wstr), OutString); +} + +std::string +WideToUtf8(const wchar_t* Wstr) +{ + ExtendableStringBuilder<128> String; + WideToUtf8(std::u16string_view{(char16_t*)Wstr}, String); + + return String.c_str(); +} + +std::string +WideToUtf8(const 
std::wstring_view Wstr) +{ + ExtendableStringBuilder<128> String; + WideToUtf8(std::u16string_view{(char16_t*)Wstr.data(), Wstr.size()}, String); + + return String.c_str(); +} + +////////////////////////////////////////////////////////////////////////// + +enum NicenumFormat +{ + kNicenum1024 = 0, // Print kilo, mega, tera, peta, exa.. + kNicenumBytes = 1, // Print single bytes ("13B"), kilo, mega, tera... + kNicenumTime = 2, // Print nanosecs, microsecs, millisecs, seconds... + kNicenumRaw = 3, // Print the raw number without any formatting + kNicenumRawTime = 4 // Same as RAW, but print dashes ('-') for zero. +}; + +namespace { + static const char* UnitStrings[3][7] = { + /* kNicenum1024 */ {"", "K", "M", "G", "T", "P", "E"}, + /* kNicenumBytes */ {"B", "K", "M", "G", "T", "P", "E"}, + /* kNicenumTime */ {"ns", "us", "ms", "s", "?", "?", "?"}}; + + static const int UnitsLen[] = { + /* kNicenum1024 */ 6, + /* kNicenumBytes */ 6, + /* kNicenumTime */ 3}; + + static const uint64_t KiloUnit[] = { + /* kNicenum1024 */ 1024, + /* kNicenumBytes */ 1024, + /* kNicenumTime */ 1000}; +} // namespace + +/* + * Convert a number to an appropriately human-readable output. 
+ */ +int +NiceNumGeneral(uint64_t Num, std::span<char> Buffer, NicenumFormat Format) +{ + switch (Format) + { + case kNicenumRaw: + return snprintf(Buffer.data(), Buffer.size(), "%llu", (uint64_t)Num); + + case kNicenumRawTime: + if (Num > 0) + { + return snprintf(Buffer.data(), Buffer.size(), "%llu", (uint64_t)Num); + } + else + { + return snprintf(Buffer.data(), Buffer.size(), "%s", "-"); + } + break; + + case kNicenum1024: + case kNicenumBytes: + case kNicenumTime: + default: + break; + } + + // Bring into range and select unit + + int Index = 0; + uint64_t n = Num; + + { + const uint64_t Unit = KiloUnit[Format]; + const int maxIndex = UnitsLen[Format]; + + while (n >= Unit && Index < maxIndex) + { + n /= Unit; + Index++; + } + } + + const char* u = UnitStrings[Format][Index]; + + if ((Index == 0) || ((Num % (uint64_t)powl((int)KiloUnit[Format], Index)) == 0)) + { + /* + * If this is an even multiple of the base, always display + * without any decimal precision. + */ + return snprintf(Buffer.data(), Buffer.size(), "%llu%s", (uint64_t)n, u); + } + else + { + /* + * We want to choose a precision that reflects the best choice + * for fitting in 5 characters. This can get rather tricky when + * we have numbers that are very close to an order of magnitude. + * For example, when displaying 10239 (which is really 9.999K), + * we want only a single place of precision for 10.0K. We could + * develop some complex heuristics for this, but it's much + * easier just to try each combination in turn. + */ + + int StrLen = 0; + + for (int i = 2; i >= 0; i--) + { + double Value = (double)Num / (uint64_t)powl((int)KiloUnit[Format], Index); + + /* + * Don't print floating point values for time. Note, + * we use floor() instead of round() here, since + * round can result in undesirable results. For + * example, if "num" is in the range of + * 999500-999999, it will print out "1000us". This + * doesn't happen if we use floor(). 
+ */ + if (Format == kNicenumTime) + { + StrLen = snprintf(Buffer.data(), Buffer.size(), "%d%s", (unsigned int)floor(Value), u); + + if (StrLen <= 5) + break; + } + else + { + StrLen = snprintf(Buffer.data(), Buffer.size(), "%.*f%s", i, Value, u); + + if (StrLen <= 5) + break; + } + } + + return StrLen; + } +} + +size_t +NiceNumToBuffer(uint64_t Num, std::span<char> Buffer) +{ + return NiceNumGeneral(Num, Buffer, kNicenum1024); +} + +size_t +NiceBytesToBuffer(uint64_t Num, std::span<char> Buffer) +{ + return NiceNumGeneral(Num, Buffer, kNicenumBytes); +} + +size_t +NiceByteRateToBuffer(uint64_t Num, uint64_t ElapsedMs, std::span<char> Buffer) +{ + size_t n = NiceNumGeneral(Num * 1000 / ElapsedMs, Buffer, kNicenumBytes); + + Buffer[n++] = '/'; + Buffer[n++] = 's'; + Buffer[n++] = '\0'; + + return n; +} + +size_t +NiceLatencyNsToBuffer(uint64_t Nanos, std::span<char> Buffer) +{ + return NiceNumGeneral(Nanos, Buffer, kNicenumTime); +} + +size_t +NiceTimeSpanMsToBuffer(uint64_t Millis, std::span<char> Buffer) +{ + if (Millis < 1000) + { + return snprintf(Buffer.data(), Buffer.size(), "%" PRIu64 "ms", Millis); + } + else if (Millis < 10000) + { + return snprintf(Buffer.data(), Buffer.size(), "%.2fs", Millis / 1000.0); + } + else if (Millis < 60000) + { + return snprintf(Buffer.data(), Buffer.size(), "%.1fs", Millis / 1000.0); + } + else if (Millis < 60 * 60000) + { + return snprintf(Buffer.data(), Buffer.size(), "%" PRIu64 "m%02" PRIu64 "s", Millis / 60000, (Millis / 1000) % 60); + } + else + { + return snprintf(Buffer.data(), Buffer.size(), "%" PRIu64 "h%02" PRIu64 "m", Millis / 3600000, (Millis / 60000) % 60); + } +} + +////////////////////////////////////////////////////////////////////////// + +template<typename C> +StringBuilderImpl<C>::~StringBuilderImpl() +{ + if (m_IsDynamic) + { + FreeBuffer(m_Base, m_End - m_Base); + } +} + +template<typename C> +void +StringBuilderImpl<C>::Extend(size_t extraCapacity) +{ + if (!m_IsExtendable) + { + Fail("exceeded capacity"); 
+ } + + const size_t oldCapacity = m_End - m_Base; + const size_t newCapacity = NextPow2(oldCapacity + extraCapacity); + + C* newBase = (C*)AllocBuffer(newCapacity); + + size_t pos = m_CurPos - m_Base; + memcpy(newBase, m_Base, pos * sizeof(C)); + + if (m_IsDynamic) + { + FreeBuffer(m_Base, oldCapacity); + } + + m_Base = newBase; + m_CurPos = newBase + pos; + m_End = newBase + newCapacity; + m_IsDynamic = true; +} + +template<typename C> +void* +StringBuilderImpl<C>::AllocBuffer(size_t byteCount) +{ + return Memory::Alloc(byteCount * sizeof(C)); +} + +template<typename C> +void +StringBuilderImpl<C>::FreeBuffer(void* buffer, size_t byteCount) +{ + ZEN_UNUSED(byteCount); + + Memory::Free(buffer); +} + +template<typename C> +[[noreturn]] void +StringBuilderImpl<C>::Fail(const char* reason) +{ + throw std::exception(reason); +} + +// Instantiate templates once + +template class StringBuilderImpl<char>; +template class StringBuilderImpl<wchar_t>; + +////////////////////////////////////////////////////////////////////////// +// +// Unit tests +// + +TEST_CASE("niceNum") +{ + char Buffer[16]; + + SUBCASE("raw") + { + NiceNumGeneral(1, Buffer, kNicenumRaw); + CHECK(StringEquals(Buffer, "1")); + + NiceNumGeneral(10, Buffer, kNicenumRaw); + CHECK(StringEquals(Buffer, "10")); + + NiceNumGeneral(100, Buffer, kNicenumRaw); + CHECK(StringEquals(Buffer, "100")); + + NiceNumGeneral(1000, Buffer, kNicenumRaw); + CHECK(StringEquals(Buffer, "1000")); + + NiceNumGeneral(10000, Buffer, kNicenumRaw); + CHECK(StringEquals(Buffer, "10000")); + + NiceNumGeneral(100000, Buffer, kNicenumRaw); + CHECK(StringEquals(Buffer, "100000")); + } + + SUBCASE("1024") + { + NiceNumGeneral(1, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1")); + + NiceNumGeneral(10, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "10")); + + NiceNumGeneral(100, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "100")); + + NiceNumGeneral(1000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1000")); + 
+ NiceNumGeneral(10000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "9.77K")); + + NiceNumGeneral(100000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "97.7K")); + + NiceNumGeneral(1000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "977K")); + + NiceNumGeneral(10000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "9.54M")); + + NiceNumGeneral(100000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "95.4M")); + + NiceNumGeneral(1000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "954M")); + + NiceNumGeneral(10000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "9.31G")); + + NiceNumGeneral(100000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "93.1G")); + + NiceNumGeneral(1000000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "931G")); + + NiceNumGeneral(10000000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "9.09T")); + + NiceNumGeneral(100000000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "90.9T")); + + NiceNumGeneral(1000000000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "909T")); + + NiceNumGeneral(10000000000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "8.88P")); + + NiceNumGeneral(100000000000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "88.8P")); + + NiceNumGeneral(1000000000000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "888P")); + + NiceNumGeneral(10000000000000000000, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "8.67E")); + + // pow2 + + NiceNumGeneral(0, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "0")); + + NiceNumGeneral(1, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1")); + + NiceNumGeneral(1024, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1K")); + + NiceNumGeneral(1024 * 1024, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1M")); + + NiceNumGeneral(1024 * 1024 * 1024, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1G")); + + 
NiceNumGeneral(1024llu * 1024 * 1024 * 1024, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1T")); + + NiceNumGeneral(1024llu * 1024 * 1024 * 1024 * 1024, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1P")); + + NiceNumGeneral(1024llu * 1024 * 1024 * 1024 * 1024 * 1024, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1E")); + + // pow2-1 + + NiceNumGeneral(1023, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "1023")); + + NiceNumGeneral(2047, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "2.00K")); + + NiceNumGeneral(9 * 1024 - 1, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "9.00K")); + + NiceNumGeneral(10 * 1024 - 1, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "10.0K")); + + NiceNumGeneral(10 * 1024 - 5, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "10.0K")); + + NiceNumGeneral(10 * 1024 - 6, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "9.99K")); + + NiceNumGeneral(10 * 1024 - 10, Buffer, kNicenum1024); + CHECK(StringEquals(Buffer, "9.99K")); + } + + SUBCASE("time") + { + NiceNumGeneral(1, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "1ns")); + + NiceNumGeneral(100, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "100ns")); + + NiceNumGeneral(1000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "1us")); + + NiceNumGeneral(10000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "10us")); + + NiceNumGeneral(100000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "100us")); + + NiceNumGeneral(1000000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "1ms")); + + NiceNumGeneral(10000000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "10ms")); + + NiceNumGeneral(100000000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "100ms")); + + NiceNumGeneral(1000000000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "1s")); + + NiceNumGeneral(10000000000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "10s")); + + NiceNumGeneral(100000000000, Buffer, kNicenumTime); + 
CHECK(StringEquals(Buffer, "100s")); + + NiceNumGeneral(1000000000000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "1000s")); + + NiceNumGeneral(10000000000000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "10000s")); + + NiceNumGeneral(100000000000000, Buffer, kNicenumTime); + CHECK(StringEquals(Buffer, "100000s")); + } + + SUBCASE("bytes") + { + NiceNumGeneral(1, Buffer, kNicenumBytes); + CHECK(StringEquals(Buffer, "1B")); + + NiceNumGeneral(10, Buffer, kNicenumBytes); + CHECK(StringEquals(Buffer, "10B")); + + NiceNumGeneral(100, Buffer, kNicenumBytes); + CHECK(StringEquals(Buffer, "100B")); + + NiceNumGeneral(1000, Buffer, kNicenumBytes); + CHECK(StringEquals(Buffer, "1000B")); + + NiceNumGeneral(10000, Buffer, kNicenumBytes); + CHECK(StringEquals(Buffer, "9.77K")); + } + + SUBCASE("byteRate") + { + NiceByteRateToBuffer(1, 1, Buffer); + CHECK(StringEquals(Buffer, "1000B/s")); + + NiceByteRateToBuffer(1000, 1000, Buffer); + CHECK(StringEquals(Buffer, "1000B/s")); + + NiceByteRateToBuffer(1024, 1, Buffer); + CHECK(StringEquals(Buffer, "1000K/s")); + + NiceByteRateToBuffer(1024, 1000, Buffer); + CHECK(StringEquals(Buffer, "1K/s")); + } + + SUBCASE("timespan") + { + NiceTimeSpanMsToBuffer(1, Buffer); + CHECK(StringEquals(Buffer, "1ms")); + + NiceTimeSpanMsToBuffer(900, Buffer); + CHECK(StringEquals(Buffer, "900ms")); + + NiceTimeSpanMsToBuffer(1000, Buffer); + CHECK(StringEquals(Buffer, "1.00s")); + + NiceTimeSpanMsToBuffer(1900, Buffer); + CHECK(StringEquals(Buffer, "1.90s")); + + NiceTimeSpanMsToBuffer(19000, Buffer); + CHECK(StringEquals(Buffer, "19.0s")); + + NiceTimeSpanMsToBuffer(60000, Buffer); + CHECK(StringEquals(Buffer, "1m00s")); + + NiceTimeSpanMsToBuffer(600000, Buffer); + CHECK(StringEquals(Buffer, "10m00s")); + + NiceTimeSpanMsToBuffer(3600000, Buffer); + CHECK(StringEquals(Buffer, "1h00m")); + + NiceTimeSpanMsToBuffer(36000000, Buffer); + CHECK(StringEquals(Buffer, "10h00m")); + + NiceTimeSpanMsToBuffer(360000000, Buffer); + 
CHECK(StringEquals(Buffer, "100h00m")); + } +} + +void +string_forcelink() +{ +} + +TEST_CASE("StringBuilder") +{ + StringBuilder<64> sb; + + SUBCASE("Empty init") + { + const char* str = sb.c_str(); + + CHECK(StringLength(str) == 0); + } + + SUBCASE("Append single character") + { + sb.Append('a'); + + const char* str = sb.c_str(); + CHECK(StringLength(str) == 1); + CHECK(str[0] == 'a'); + + sb.Append('b'); + str = sb.c_str(); + CHECK(StringLength(str) == 2); + CHECK(str[0] == 'a'); + CHECK(str[1] == 'b'); + } + + SUBCASE("Append string") + { + sb.Append("a"); + + const char* str = sb.c_str(); + CHECK(StringLength(str) == 1); + CHECK(str[0] == 'a'); + + sb.Append("b"); + str = sb.c_str(); + CHECK(StringLength(str) == 2); + CHECK(str[0] == 'a'); + CHECK(str[1] == 'b'); + + sb.Append("cdefghijklmnopqrstuvwxyz"); + CHECK(sb.Size() == 26); + + sb.Append("abcdefghijklmnopqrstuvwxyz"); + CHECK(sb.Size() == 52); + + sb.Append("abcdefghijk"); + CHECK(sb.Size() == 63); + } +} + +TEST_CASE("ExtendableStringBuilder") +{ + ExtendableStringBuilder<16> sb; + + SUBCASE("Empty init") + { + const char* str = sb.c_str(); + + CHECK(StringLength(str) == 0); + } + + SUBCASE("Short append") + { + sb.Append("abcd"); + CHECK(sb.IsDynamic() == false); + } + + SUBCASE("Short+long append") + { + sb.Append("abcd"); + CHECK(sb.IsDynamic() == false); + // This should trigger a dynamic buffer allocation since the required + // capacity exceeds the internal fixed buffer. 
+ sb.Append("abcdefghijklmnopqrstuvwxyz"); + CHECK(sb.IsDynamic() == true); + CHECK(sb.Size() == 30); + CHECK(sb.Size() == StringLength(sb.c_str())); + } +} + +TEST_CASE("WideStringBuilder") +{ + WideStringBuilder<64> sb; + + SUBCASE("Empty init") + { + const wchar_t* str = sb.c_str(); + + CHECK(StringLength(str) == 0); + } + + SUBCASE("Append single character") + { + sb.Append(L'a'); + + const wchar_t* str = sb.c_str(); + CHECK(StringLength(str) == 1); + CHECK(str[0] == L'a'); + + sb.Append(L'b'); + str = sb.c_str(); + CHECK(StringLength(str) == 2); + CHECK(str[0] == L'a'); + CHECK(str[1] == L'b'); + } + + SUBCASE("Append string") + { + sb.Append(L"a"); + + const wchar_t* str = sb.c_str(); + CHECK(StringLength(str) == 1); + CHECK(str[0] == L'a'); + + sb.Append(L"b"); + str = sb.c_str(); + CHECK(StringLength(str) == 2); + CHECK(str[0] == L'a'); + CHECK(str[1] == L'b'); + + sb.Append(L"cdefghijklmnopqrstuvwxyz"); + CHECK(sb.Size() == 26); + + sb.Append(L"abcdefghijklmnopqrstuvwxyz"); + CHECK(sb.Size() == 52); + + sb.Append(L"abcdefghijk"); + CHECK(sb.Size() == 63); + } +} + +TEST_CASE("ExtendableWideStringBuilder") +{ + ExtendableWideStringBuilder<16> sb; + + SUBCASE("Empty init") + { + CHECK(sb.Size() == 0); + + const wchar_t* str = sb.c_str(); + CHECK(StringLength(str) == 0); + } + + SUBCASE("Short append") + { + sb.Append(L"abcd"); + CHECK(sb.IsDynamic() == false); + } + + SUBCASE("Short+long append") + { + sb.Append(L"abcd"); + CHECK(sb.IsDynamic() == false); + // This should trigger a dynamic buffer allocation since the required + // capacity exceeds the internal fixed buffer. 
+ sb.Append(L"abcdefghijklmnopqrstuvwxyz"); + CHECK(sb.IsDynamic() == true); + CHECK(sb.Size() == 30); + CHECK(sb.Size() == StringLength(sb.c_str())); + } +} + +TEST_CASE("utf8") +{ + SUBCASE("utf8towide") + { + // TODO: add more extensive testing here - this covers a very small space + + WideStringBuilder<32> wout; + Utf8ToWide(u8"abcdefghi", wout); + CHECK(StringEquals(L"abcdefghi", wout.c_str())); + + wout.Reset(); + + Utf8ToWide(u8"abc���", wout); + CHECK(StringEquals(L"abc���", wout.c_str())); + } + + SUBCASE("widetoutf8") + { + // TODO: add more extensive testing here - this covers a very small space + + StringBuilder<32> out; + + WideToUtf8(L"abcdefghi", out); + CHECK(StringEquals("abcdefghi", out.c_str())); + + out.Reset(); + + WideToUtf8(L"abc���", out); + CHECK(StringEquals(u8"abc���", out.c_str())); + } +} + +TEST_CASE("filepath") +{ + CHECK(FilepathFindExtension("foo\\bar\\baz.txt", ".txt") != nullptr); + CHECK(FilepathFindExtension("foo\\bar\\baz.txt", ".zap") == nullptr); + + CHECK(FilepathFindExtension("foo\\bar\\baz.txt") != nullptr); + CHECK(FilepathFindExtension("foo\\bar\\baz.txt") == std::string_view(".txt")); + + CHECK(FilepathFindExtension(".txt") == std::string_view(".txt")); +} + +} // namespace zen diff --git a/zencore/thread.cpp b/zencore/thread.cpp new file mode 100644 index 000000000..80cf6f100 --- /dev/null +++ b/zencore/thread.cpp @@ -0,0 +1,192 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <zencore/thread.h> + +#include <zencore/except.h> +#include <zencore/string.h> +#include <zencore/windows.h> +#include <thread> + +namespace zen { + +void +RwLock::AcquireShared() +{ + AcquireSRWLockShared((PSRWLOCK)&m_Srw); +} + +void +RwLock::ReleaseShared() +{ + ReleaseSRWLockShared((PSRWLOCK)&m_Srw); +} + +void +RwLock::AcquireExclusive() +{ + AcquireSRWLockExclusive((PSRWLOCK)&m_Srw); +} + +void +RwLock::ReleaseExclusive() +{ + ReleaseSRWLockExclusive((PSRWLOCK)&m_Srw); +} + +Event::Event() +{ + m_EventHandle = CreateEvent(nullptr, true, false, nullptr); +} + +Event::~Event() +{ + CloseHandle(m_EventHandle); +} + +void +Event::Set() +{ + SetEvent(m_EventHandle); +} + +void +Event::Reset() +{ + ResetEvent(m_EventHandle); +} + +bool +Event::Wait(int TimeoutMs) +{ + const DWORD Timeout = (TimeoutMs < 0) ? INFINITE : TimeoutMs; + + DWORD Result = WaitForSingleObject(m_EventHandle, Timeout); + + if (Result == WAIT_FAILED) + { + throw WindowsException("Event wait failed"); + } + + return (Result == WAIT_OBJECT_0); +} + +NamedEvent::NamedEvent(std::u8string_view EventName) : Event(nullptr) +{ + using namespace std::literals; + + ExtendableStringBuilder<64> Name; + Name << "Local\\"sv; + Name << EventName; + + m_EventHandle = CreateEventA(nullptr, true, false, Name.c_str()); +} + +NamedEvent::NamedEvent(std::string_view EventName) : Event(nullptr) +{ + using namespace std::literals; + + ExtendableStringBuilder<64> Name; + Name << "Local\\"sv; + Name << EventName; + + m_EventHandle = CreateEventA(nullptr, true, false, Name.c_str()); +} + +Process::Process() = default; + +void +Process::Initialize(void* ProcessHandle) +{ + ZEN_ASSERT(m_ProcessHandle == nullptr); + // TODO: perform some debug verification here to verify it's a valid handle? 
+ m_ProcessHandle = ProcessHandle; +} + +Process::~Process() +{ + if (IsValid()) + { + CloseHandle(m_ProcessHandle); + m_ProcessHandle = nullptr; + } +} + +void +Process::Initialize(int Pid) +{ + ZEN_ASSERT(m_ProcessHandle == nullptr); + m_ProcessHandle = OpenProcess(PROCESS_QUERY_INFORMATION, FALSE, Pid); + m_Pid = Pid; +} + +bool +Process::IsRunning() const +{ + DWORD ExitCode = 0; + GetExitCodeProcess(m_ProcessHandle, &ExitCode); + + return ExitCode == STILL_ACTIVE; +} + +bool +Process::IsValid() const +{ + return (m_ProcessHandle != nullptr) && (m_ProcessHandle != INVALID_HANDLE_VALUE); +} + +void +Process::Terminate(int ExitCode) +{ + if (IsRunning()) + { + TerminateProcess(m_ProcessHandle, ExitCode); + } + + DWORD WaitResult = WaitForSingleObject(m_ProcessHandle, INFINITE); + + if (WaitResult != WAIT_OBJECT_0) + { + // What might go wrong here, and what is meaningful to act on? + } +} + +bool +Process::Wait(int TimeoutMs) +{ + const DWORD Timeout = (TimeoutMs < 0) ? INFINITE : TimeoutMs; + + const DWORD WaitResult = WaitForSingleObject(m_ProcessHandle, Timeout); + + switch (WaitResult) + { + case WAIT_OBJECT_0: + return true; + + case WAIT_TIMEOUT: + return false; + + case WAIT_FAILED: + // What might go wrong here, and what is meaningful to act on? + throw WindowsException("Process::Wait failed"); + } + + return false; +} + +void +Sleep(int ms) +{ + ::Sleep(ms); +} + +////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... +// + +void +thread_forcelink() +{ +} + +} // namespace zen diff --git a/zencore/timer.cpp b/zencore/timer.cpp new file mode 100644 index 000000000..ee8e1cf9c --- /dev/null +++ b/zencore/timer.cpp @@ -0,0 +1,67 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <doctest/doctest.h> +#include <zencore/thread.h> +#include <zencore/timer.h> +#include <zencore/windows.h> + +namespace zen { + +uint64_t +GetHifreqTimerValue() +{ + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + + return li.QuadPart; +} + +uint64_t +internalGetHifreqTimerFrequency() +{ + LARGE_INTEGER li; + QueryPerformanceFrequency(&li); + + return li.QuadPart; +} + +static uint64_t qpcFreq = internalGetHifreqTimerFrequency(); + +uint64_t +GetHifreqTimerFrequency() +{ + return qpcFreq; +} + +uint64_t +GetHifreqTimerFrequencySafe() +{ + if (!qpcFreq) + qpcFreq = internalGetHifreqTimerFrequency(); + + return qpcFreq; +} + +////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... +// + +void +timer_forcelink() +{ +} + +TEST_CASE("Timer") +{ + uint64_t s0 = GetHifreqTimerValue(); + uint64_t t0 = GetCpuTimerValue(); + Sleep(1000); + uint64_t s1 = GetHifreqTimerValue(); + uint64_t t1 = GetCpuTimerValue(); + // double r = double(t1 - t0) / (s1 - s0); + CHECK_NE(t0, t1); + CHECK_NE(s0, s1); +} + +} // namespace zen diff --git a/zencore/trace.cpp b/zencore/trace.cpp new file mode 100644 index 000000000..8313b3b66 --- /dev/null +++ b/zencore/trace.cpp @@ -0,0 +1,51 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "zencore/trace.h" +#include <doctest/doctest.h> +#include <zencore/windows.h> + +namespace zen { + +void +Tracer::Log(const TraceEvent& e) +{ + TraceBroadcast(e); +} + +Tracer g_globalTracer; + +struct alignas(64) TraceHandlerList +{ + enum + { + kMaxHandlers = 7 + }; + + uint8_t handlerCount = 0; + TraceHandler* handlers[kMaxHandlers]; +}; + +static TraceHandlerList g_traceHandlers; + +void +TraceBroadcast(const TraceEvent& e) +{ + for (size_t i = 0; i < g_traceHandlers.handlerCount; ++i) + { + g_traceHandlers.handlers[i]->Trace(e); + } +} + +void +trace_forcelink() +{ +} + +////////////////////////////////////////////////////////////////////////// + +TEST_CASE("Tracer") +{ + SUBCASE("Simple") { U_LOG_INFO("bajs"); } +} + +} // namespace zen diff --git a/zencore/uid.cpp b/zencore/uid.cpp new file mode 100644 index 000000000..9506b305c --- /dev/null +++ b/zencore/uid.cpp @@ -0,0 +1,196 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zencore/uid.h> + +#include <zencore/string.h> + +#if _WIN32 +# include <zencore/windows.h> +# include <bcrypt.h> +# pragma comment(lib, "bcrypt.lib") +#endif + +#include <atomic> +#include <bit> +#include <set> +#include <unordered_map> + +#include <doctest/doctest.h> + +namespace zen { + +////////////////////////////////////////////////////////////////////////// + +template<typename T> +T +EndianSwap(T value) +{ + uint8_t dest[sizeof value]; + + memcpy(dest, &value, sizeof value); + + for (int i = 0; i < sizeof(value); i++) + { + uint8_t& other = dest[sizeof(value) - i - 1]; + + uint8_t temp = dest[i]; + dest[i] = other; + other = temp; + } + + T ret; + + memcpy(&ret, &value, sizeof value); + + return ret; +} + +#if _WIN32 +__forceinline uint16_t +EndianSwap(uint16_t value) +{ + return _byteswap_ushort(value); +} + +__forceinline uint32_t +EndianSwap(uint32_t value) +{ + return _byteswap_ulong(value); +} + +__forceinline uint64_t +EndianSwap(uint64_t value) +{ + return _byteswap_uint64(value); +} +#endif 
+ +////////////////////////////////////////////////////////////////////////// + +namespace detail { + static bool OidInitialised; + static uint32_t RunId; + static std::atomic_uint32_t Serial; + + // Number of 100 nanosecond units from 1/1/1601 to 1/1/1970 - used for Windows impl + constexpr int64_t kEpochBias = 116'444'736'000'000'000ull; +} // namespace detail + +////////////////////////////////////////////////////////////////////////// + +const Oid Oid::Zero = {{0u, 0u, 0u}}; +const Oid Oid::Max = {{~0u, ~0u, ~0u}}; + +void +Oid::Initialize() +{ + using namespace detail; + + if (OidInitialised) + return; + + OidInitialised = true; + +#if _WIN32 + char rng[8]; + BCryptGenRandom(NULL, (PUCHAR)rng, sizeof rng, BCRYPT_USE_SYSTEM_PREFERRED_RNG); + + memcpy(&RunId, &rng[0], sizeof(RunId)); + memcpy((void*)&Serial, &rng[4], sizeof(Serial)); +#else +# error Must implement Oid::Initialize +#endif +} + +const Oid& +Oid::Generate() +{ + using namespace detail; + + if (!OidInitialised) + { + Oid::Initialize(); + } + +#if _WIN32 + FILETIME filetime; + + GetSystemTimeAsFileTime(&filetime); // Time is UTC + + uint64_t filetime64; + memcpy(&filetime64, &filetime, sizeof filetime); + + OidBits[0] = EndianSwap(uint32_t((filetime64 - kEpochBias) / 10'000'000l)); + OidBits[1] = EndianSwap(uint32_t(Serial++)); + OidBits[2] = RunId; +#else +# error Must implement Oid::Generate +#endif + + return *this; +} + +Oid +Oid::NewOid() +{ + return Oid().Generate(); +} + +Oid +Oid::FromHexString(const std::string_view String) +{ + ZEN_ASSERT(String.size() == 2 * sizeof(Oid::OidBits)); + + Oid Id; + + ParseHexBytes(String.data(), String.size(), reinterpret_cast<uint8_t*>(Id.OidBits)); + + return Id; +} + +StringBuilderBase& +Oid::ToString(StringBuilderBase& OutString) const +{ + char str[25]; + ToHexBytes(reinterpret_cast<const uint8_t*>(OidBits), sizeof(Oid::OidBits), str); + str[2 * sizeof(Oid)] = '\0'; + + OutString.AppendRange(str, &str[25]); + + return OutString; +} + +TEST_CASE("Oid") +{ 
+ SUBCASE("Basic") + { + Oid id1 = Oid::NewOid(); + + std::vector<Oid> ids; + std::set<Oid> idset; + std::unordered_map<Oid, int, Oid::Hasher> idmap; + + const int Count = 1000; + + for (int i = 0; i < Count; ++i) + { + Oid id; + id.Generate(); + + ids.emplace_back(id); + idset.insert(id); + idmap.insert({id, i}); + } + + CHECK(ids.size() == Count); + CHECK(idset.size() == Count); // All ids should be unique + CHECK(idmap.size() == Count); // Ditto + } +} + +void +uid_forcelink() +{ +} + +} // namespace zen diff --git a/zencore/xxhash.cpp b/zencore/xxhash.cpp new file mode 100644 index 000000000..a20ee10bd --- /dev/null +++ b/zencore/xxhash.cpp @@ -0,0 +1,50 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zencore/xxhash.h> + +#include <zencore/string.h> + +#include <doctest/doctest.h> +#include <gsl/gsl-lite.hpp> + +namespace zen { + +XXH3_128 XXH3_128::Zero; // Initialized to all zeros + +XXH3_128 +XXH3_128::FromHexString(const char* InString) +{ + return FromHexString({InString, sizeof(XXH3_128::Hash) * 2}); +} + +XXH3_128 +XXH3_128::FromHexString(std::string_view InString) +{ + ZEN_ASSERT(InString.size() == 2 * sizeof(XXH3_128::Hash)); + + XXH3_128 Xx; + ParseHexBytes(InString.data(), InString.size(), Xx.Hash); + return Xx; +} + +const char* +XXH3_128::ToHexString(char* OutString /* 40 characters + NUL terminator */) const +{ + ToHexBytes(Hash, sizeof(XXH3_128), OutString); + OutString[2 * sizeof(XXH3_128)] = '\0'; + + return OutString; +} + +StringBuilderBase& +XXH3_128::ToHexString(StringBuilderBase& OutBuilder) const +{ + String_t str; + ToHexString(str); + + OutBuilder.AppendRange(str, &str[StringLength]); + + return OutBuilder; +} + +} // namespace zen diff --git a/zencore/zencore.cpp b/zencore/zencore.cpp new file mode 100644 index 000000000..d4853b043 --- /dev/null +++ b/zencore/zencore.cpp @@ -0,0 +1,70 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <zencore/zencore.h> + +#include <zencore/windows.h> + +#include <zencore/blake3.h> +#include <zencore/compactbinary.h> +#include <zencore/compactbinarybuilder.h> +#include <zencore/compactbinarypackage.h> +#include <zencore/iobuffer.h> +#include <zencore/memory.h> +#include <zencore/refcount.h> +#include <zencore/sha1.h> +#include <zencore/snapshot_manifest.h> +#include <zencore/stats.h> +#include <zencore/stream.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/timer.h> +#include <zencore/trace.h> +#include <zencore/uid.h> + +bool +IsPointerToStack(const void* ptr) +{ + ULONG_PTR low, high; + GetCurrentThreadStackLimits(&low, &high); + + const uintptr_t intPtr = reinterpret_cast<uintptr_t>(ptr); + + return (intPtr - low) < (high - low); +} + +static int s_ApplicationExitCode = 0; +static bool s_ApplicationExitRequested; + +bool +IsApplicationExitRequested() +{ + return s_ApplicationExitRequested; +} + +void +RequestApplicationExit(int ExitCode) +{ + s_ApplicationExitCode = ExitCode; + s_ApplicationExitRequested = true; +} + +void +zencore_forcelinktests() +{ + zen::sha1_forcelink(); + zen::blake3_forcelink(); + zen::trace_forcelink(); + zen::timer_forcelink(); + zen::uid_forcelink(); + zen::string_forcelink(); + zen::thread_forcelink(); + zen::stream_forcelink(); + zen::refcount_forcelink(); + zen::snapshotmanifest_forcelink(); + zen::iobuffer_forcelink(); + zen::stats_forcelink(); + zen::uson_forcelink(); + zen::usonbuilder_forcelink(); + zen::usonpackage_forcelink(); + zen::memory_forcelink(); +} diff --git a/zencore/zencore.vcxproj b/zencore/zencore.vcxproj new file mode 100644 index 000000000..c68e922c5 --- /dev/null +++ b/zencore/zencore.vcxproj @@ -0,0 +1,188 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + 
<Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>15.0</VCProjectVersion> + <ProjectGuid>{D75BF9AB-C61E-4FFF-AD59-1563430F05E2}</ProjectGuid> + <Keyword>Win32Proj</Keyword> + <RootNamespace>zencore</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>false</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_debug.props" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" 
Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_release.props" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <LinkIncremental>true</LinkIncremental> + <EnableMicrosoftCodeAnalysis>false</EnableMicrosoftCodeAnalysis> + <EnableClangTidyCodeAnalysis>true</EnableClangTidyCodeAnalysis> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <LinkIncremental>false</LinkIncremental> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <PrecompiledHeader>NotUsing</PrecompiledHeader> + <Optimization>Disabled</Optimization> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>.\include;..\3rdparty\utfcpp\source</AdditionalIncludeDirectories> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + <LanguageStandard>stdcpplatest</LanguageStandard> + <TreatWarningAsError>true</TreatWarningAsError> + <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> + </ClCompile> + <Link> + <SubSystem>Windows</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + <Lib /> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + 
<PrecompiledHeader>NotUsing</PrecompiledHeader> + <Optimization>MaxSpeed</Optimization> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>.\include;..\3rdparty\utfcpp\source</AdditionalIncludeDirectories> + <WholeProgramOptimization>false</WholeProgramOptimization> + <LanguageStandard>stdcpplatest</LanguageStandard> + <TreatWarningAsError>true</TreatWarningAsError> + <RuntimeLibrary>MultiThreaded</RuntimeLibrary> + </ClCompile> + <Link> + <SubSystem>Windows</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + <Lib> + <Verbose>true</Verbose> + </Lib> + </ItemDefinitionGroup> + <ItemGroup> + <ClInclude Include="include\zencore\atomic.h" /> + <ClInclude Include="include\zencore\blake3.h" /> + <ClInclude Include="include\zencore\enumflags.h" /> + <ClInclude Include="include\zencore\except.h" /> + <ClInclude Include="include\zencore\compress.h" /> + <ClInclude Include="include\zencore\filesystem.h" /> + <ClInclude Include="include\zencore\fmtutils.h" /> + <ClInclude Include="include\zencore\httpclient.h" /> + <ClInclude Include="include\zencore\httpserver.h" /> + <ClInclude Include="include\zencore\intmath.h" /> + <ClInclude Include="include\zencore\iohash.h" /> + <ClInclude Include="include\zencore\md5.h" /> + <ClInclude Include="include\zencore\memory.h" /> + <ClInclude Include="include\zencore\refcount.h" /> + <ClInclude Include="include\zencore\scopeguard.h" /> + <ClInclude Include="include\zencore\sha1.h" /> + <ClInclude Include="include\zencore\iobuffer.h" /> + <ClInclude Include="include\zencore\sharedbuffer.h" /> + <ClInclude Include="include\zencore\snapshot_manifest.h" /> + <ClInclude 
Include="include\zencore\stats.h" /> + <ClInclude Include="include\zencore\stream.h" /> + <ClInclude Include="include\zencore\streamutil.h" /> + <ClInclude Include="include\zencore\string.h" /> + <ClInclude Include="include\zencore\targetver.h" /> + <ClInclude Include="include\zencore\thread.h" /> + <ClInclude Include="include\zencore\timer.h" /> + <ClInclude Include="include\zencore\trace.h" /> + <ClInclude Include="include\zencore\uid.h" /> + <ClInclude Include="include\zencore\compactbinary.h" /> + <ClInclude Include="include\zencore\compactbinarybuilder.h" /> + <ClInclude Include="include\zencore\compactbinarypackage.h" /> + <ClInclude Include="include\zencore\compactbinaryvalidation.h" /> + <ClInclude Include="include\zencore\varint.h" /> + <ClInclude Include="include\zencore\windows.h" /> + <ClInclude Include="include\zencore\xxhash.h" /> + <ClInclude Include="include\zencore\zencore.h" /> + <ClInclude Include="iothreadpool.h" /> + </ItemGroup> + <ItemGroup> + <ClCompile Include="blake3.cpp" /> + <ClCompile Include="compress.cpp" /> + <ClCompile Include="except.cpp" /> + <ClCompile Include="filesystem.cpp" /> + <ClCompile Include="httpclient.cpp" /> + <ClCompile Include="httpserver.cpp" /> + <ClCompile Include="iohash.cpp" /> + <ClCompile Include="iothreadpool.cpp" /> + <ClCompile Include="md5.cpp" /> + <ClCompile Include="memory.cpp" /> + <ClCompile Include="refcount.cpp" /> + <ClCompile Include="sha1.cpp"> + <Optimization Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MaxSpeed</Optimization> + <InlineFunctionExpansion Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AnySuitable</InlineFunctionExpansion> + <IntrinsicFunctions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</IntrinsicFunctions> + <BasicRuntimeChecks Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Default</BasicRuntimeChecks> + </ClCompile> + <ClCompile Include="iobuffer.cpp" /> + <ClCompile Include="sharedbuffer.cpp" /> + <ClCompile 
Include="snapshot_manifest.cpp" /> + <ClCompile Include="stats.cpp" /> + <ClCompile Include="stream.cpp" /> + <ClCompile Include="streamutil.cpp" /> + <ClCompile Include="string.cpp" /> + <ClCompile Include="thread.cpp" /> + <ClCompile Include="timer.cpp" /> + <ClCompile Include="trace.cpp" /> + <ClCompile Include="uid.cpp" /> + <ClCompile Include="compactbinary.cpp" /> + <ClCompile Include="compactbinarybuilder.cpp" /> + <ClCompile Include="compactbinarypackage.cpp" /> + <ClCompile Include="compactbinaryvalidation.cpp" /> + <ClCompile Include="xxhash.cpp" /> + <ClCompile Include="zencore.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/zencore/zencore.vcxproj.filters b/zencore/zencore.vcxproj.filters new file mode 100644 index 000000000..c25f99e77 --- /dev/null +++ b/zencore/zencore.vcxproj.filters @@ -0,0 +1,78 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClInclude Include="include\zencore\intmath.h" /> + <ClInclude Include="include\zencore\scopeguard.h" /> + <ClInclude Include="include\zencore\sha1.h" /> + <ClInclude Include="include\zencore\snapshot_manifest.h" /> + <ClInclude Include="include\zencore\targetver.h" /> + <ClInclude Include="include\zencore\zencore.h" /> + <ClInclude Include="include\zencore\compactbinary.h" /> + <ClInclude Include="include\zencore\uid.h" /> + <ClInclude Include="include\zencore\trace.h" /> + <ClInclude Include="include\zencore\compress.h" /> + <ClInclude Include="include\zencore\timer.h" /> + <ClInclude Include="include\zencore\thread.h" /> + <ClInclude Include="include\zencore\string.h" /> + <ClInclude Include="include\zencore\streamutil.h" /> + <ClInclude Include="include\zencore\stream.h" /> + <ClInclude Include="include\zencore\stats.h" /> + <ClInclude Include="include\zencore\blake3.h" /> + <ClInclude Include="include\zencore\atomic.h" /> + <ClInclude Include="include\zencore\enumflags.h" /> + <ClInclude Include="include\zencore\except.h" /> + <ClInclude Include="include\zencore\filesystem.h" /> + <ClInclude Include="include\zencore\httpserver.h" /> + <ClInclude Include="include\zencore\refcount.h" /> + <ClInclude Include="include\zencore\memory.h" /> + <ClInclude Include="include\zencore\windows.h" /> + <ClInclude Include="include\zencore\iobuffer.h" /> + <ClInclude Include="include\zencore\sharedbuffer.h" /> + <ClInclude Include="include\zencore\iohash.h" /> + <ClInclude Include="include\zencore\compactbinarybuilder.h" /> + <ClInclude Include="include\zencore\compactbinarypackage.h" /> + <ClInclude 
Include="include\zencore\compactbinaryvalidation.h" /> + <ClInclude Include="include\zencore\httpclient.h" /> + <ClInclude Include="include\zencore\md5.h" /> + <ClInclude Include="include\zencore\fmtutils.h" /> + <ClInclude Include="include\zencore\xxhash.h" /> + <ClInclude Include="iothreadpool.h" /> + <ClInclude Include="include\zencore\varint.h" /> + </ItemGroup> + <ItemGroup> + <ClCompile Include="snapshot_manifest.cpp" /> + <ClCompile Include="sha1.cpp" /> + <ClCompile Include="zencore.cpp" /> + <ClCompile Include="compactbinary.cpp" /> + <ClCompile Include="uid.cpp" /> + <ClCompile Include="blake3.cpp" /> + <ClCompile Include="filesystem.cpp" /> + <ClCompile Include="httpserver.cpp" /> + <ClCompile Include="memory.cpp" /> + <ClCompile Include="refcount.cpp" /> + <ClCompile Include="stats.cpp" /> + <ClCompile Include="stream.cpp" /> + <ClCompile Include="streamutil.cpp" /> + <ClCompile Include="string.cpp" /> + <ClCompile Include="thread.cpp" /> + <ClCompile Include="timer.cpp" /> + <ClCompile Include="trace.cpp" /> + <ClCompile Include="iobuffer.cpp" /> + <ClCompile Include="sharedbuffer.cpp" /> + <ClCompile Include="iohash.cpp" /> + <ClCompile Include="compactbinarybuilder.cpp" /> + <ClCompile Include="compactbinarypackage.cpp" /> + <ClCompile Include="compactbinaryvalidation.cpp" /> + <ClCompile Include="httpclient.cpp" /> + <ClCompile Include="md5.cpp" /> + <ClCompile Include="except.cpp" /> + <ClCompile Include="xxhash.cpp" /> + <ClCompile Include="iothreadpool.cpp" /> + <ClCompile Include="compress.cpp" /> + </ItemGroup> + <ItemGroup> + <Filter Include="CAS"> + <UniqueIdentifier>{af5266fa-37a5-494c-9116-b15a3e6edd29}</UniqueIdentifier> + </Filter> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/zenfs_common.props b/zenfs_common.props new file mode 100644 index 000000000..e01612b41 --- /dev/null +++ b/zenfs_common.props @@ -0,0 +1,17 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ImportGroup Label="PropertySheets" /> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(SolutionDir)\3rdparty\BLAKE3\lib\Win64</LibraryPath> + </PropertyGroup> + <ItemDefinitionGroup> + <ClCompile> + <LanguageStandard>stdcpplatest</LanguageStandard> + <WarningLevel>Level4</WarningLevel> + <MultiProcessorCompilation>true</MultiProcessorCompilation> + <TreatWarningAsError>true</TreatWarningAsError> + </ClCompile> + </ItemDefinitionGroup> + <ItemGroup /> +</Project>
\ No newline at end of file
diff --git a/zenserver-test/projectclient.cpp b/zenserver-test/projectclient.cpp
new file mode 100644
index 000000000..8af1a3b73
--- /dev/null
+++ b/zenserver-test/projectclient.cpp
@@ -0,0 +1,158 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#include "projectclient.h"
+
+#include <zencore/compactbinary.h>
+#include <zencore/sharedbuffer.h>
+#include <zencore/string.h>
+#include <zencore/zencore.h>
+
+#include <spdlog/spdlog.h>
+#include <asio.hpp>
+#include <gsl/gsl-lite.hpp>
+
+#include <atlbase.h>
+
+namespace zen {
+
+using namespace fmt::literals;
+
+/**
+ * Single connection to the local project service over a named pipe
+ *
+ * The pipe is opened in the constructor and switched to message read mode.
+ * Each call to MessageTransaction() writes one request message and reads one
+ * response message.
+ */
+struct ProjectClientConnection
+{
+    ProjectClientConnection(int BasePort) { Connect(BasePort); }
+
+    /**
+     * Open the project service pipe and put it into message read mode
+     *
+     * Throws std::system_error if the pipe cannot be opened or its mode cannot be changed.
+     * BasePort is currently unused (see the TODO on the pipe name).
+     */
+    void Connect(int BasePort)
+    {
+        WideStringBuilder<64> PipeName;
+        PipeName << "\\\\.\\pipe\\zenprj";  // TODO: this should use an instance-specific identifier!
+
+        HANDLE hPipe = CreateFileW(PipeName.c_str(),
+                                   GENERIC_READ | GENERIC_WRITE,
+                                   0,              // Sharing doesn't make any sense
+                                   nullptr,        // No security attributes
+                                   OPEN_EXISTING,  // Open existing pipe
+                                   0,              // Attributes
+                                   nullptr         // Template file
+        );
+
+        if (hPipe == INVALID_HANDLE_VALUE)
+        {
+            // Read the error code first: the logging call below may invoke APIs which overwrite it
+            const DWORD OpenError = GetLastError();
+
+            spdlog::warn("failed while creating named pipe {}", WideToUtf8(PipeName));
+
+            throw std::system_error(static_cast<int>(OpenError),
+                                    std::system_category(),
+                                    "Failed to open named pipe '{}'"_format(WideToUtf8(PipeName)));
+        }
+
+        // Take ownership before doing anything that can throw, so the handle is
+        // closed on every exit path (including a failed mode change below)
+        m_hPipe.Close();
+        m_hPipe.Attach(hPipe);
+
+        // Change to message mode
+        DWORD dwMode  = PIPE_READMODE_MESSAGE;
+        BOOL  Success = SetNamedPipeHandleState(m_hPipe, &dwMode, nullptr, nullptr);
+
+        if (!Success)
+        {
+            throw std::system_error(GetLastError(),
+                                    std::system_category(),
+                                    "Failed to change named pipe '{}' to message mode"_format(WideToUtf8(PipeName)));
+        }
+    }
+
+    ~ProjectClientConnection() {}
+
+    /**
+     * Send Request as one pipe message and return the response message as an object
+     *
+     * The returned object owns its memory; it never refers to this connection's
+     * internal buffer, so it remains valid after the connection is destroyed.
+     *
+     * Throws std::system_error if writing or reading the pipe fails.
+     */
+    CbObject MessageTransaction(CbObject Request)
+    {
+        DWORD dwWrittenBytes = 0;
+
+        MemoryView View = Request.GetView();
+
+        BOOL Success = ::WriteFile(m_hPipe, View.GetData(), gsl::narrow_cast<DWORD>(View.GetSize()), &dwWrittenBytes, nullptr);
+
+        if (!Success)
+        {
+            throw std::system_error(GetLastError(), std::system_category(), "Failed to write pipe message");
+        }
+
+        ZEN_ASSERT(dwWrittenBytes == View.GetSize());
+
+        DWORD dwReadBytes = 0;
+
+        Success = ReadFile(m_hPipe, m_Buffer, sizeof m_Buffer, &dwReadBytes, nullptr);
+
+        if (Success)
+        {
+            // The whole response fit in the embedded buffer. Copy it out: m_Buffer is a member of this
+            // connection, and LocalProjectClient destroys the connection as soon as this call returns.
+            return CbObject(SharedBuffer::Clone(MemoryView{m_Buffer, dwReadBytes}));
+        }
+
+        if (GetLastError() == ERROR_MORE_DATA)
+        {
+            // Response message is larger than our buffer - handle it by allocating a larger
+            // buffer on the heap and read the remainder into that buffer
+
+            DWORD dwBytesAvail = 0, dwLeftThisMessage = 0;
+
+            Success = PeekNamedPipe(m_hPipe, nullptr, 0, nullptr, &dwBytesAvail, &dwLeftThisMessage);
+
+            if (Success)
+            {
+                UniqueBuffer MessageBuffer = UniqueBuffer::Alloc(dwReadBytes + dwLeftThisMessage);
+
+                memcpy(MessageBuffer.GetData(), m_Buffer, dwReadBytes);
+
+                DWORD dwRemainderBytes = 0;
+
+                Success = ReadFile(m_hPipe,
+                                   reinterpret_cast<uint8_t*>(MessageBuffer.GetData()) + dwReadBytes,
+                                   dwLeftThisMessage,
+                                   &dwRemainderBytes,
+                                   nullptr);
+
+                if (Success)
+                {
+                    return CbObject(SharedBuffer(std::move(MessageBuffer)));
+                }
+            }
+        }
+
+        // Reached for any read failure other than a successfully handled ERROR_MORE_DATA
+        throw std::system_error(GetLastError(), std::system_category(), "Failed to read pipe message");
+    }
+
+private:
+    static const int kEmbeddedBufferSize = 512 - 16;
+
+    CHandle m_hPipe;  // Owns the pipe handle and closes it on destruction
+    uint8_t m_Buffer[kEmbeddedBufferSize];
+};
+
+struct LocalProjectClient::ClientImpl
+{
+    ClientImpl(int BasePort) : m_BasePort(BasePort) {}
+    ~ClientImpl() {}
+
+    void Start() {}
+    void Stop() {}
+
+    inline int BasePort() const { return m_BasePort; }
+
+private:
+    int m_BasePort = 0;
+};
+
+LocalProjectClient::LocalProjectClient(int BasePort)
+{
+    m_Impl = std::make_unique<ClientImpl>(BasePort);
+    m_Impl->Start();
+}
+
+LocalProjectClient::~LocalProjectClient()
+{
+    m_Impl->Stop();
+}
+
+// Opens a fresh pipe connection for every transaction; the connection is closed before returning
+CbObject
+LocalProjectClient::MessageTransaction(CbObject Request)
+{
+    ProjectClientConnection Cx(m_Impl->BasePort());
+
+    return Cx.MessageTransaction(Request);
+}
+
+}  // namespace zen
diff --git a/zenserver-test/projectclient.h b/zenserver-test/projectclient.h
new file mode 100644
index 000000000..1865dd67d
--- /dev/null
+++ b/zenserver-test/projectclient.h
@@ -0,0 +1,32 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <memory>
+
+#include <zencore/compactbinary.h>
+#include <zencore/refcount.h>
+
+namespace zen {
+
+/**
+ * Client for communication with local project service
+ *
+ * This is WIP and not yet functional!
+ */
+
+class LocalProjectClient : public RefCounted
+{
+public:
+    LocalProjectClient(int BasePort = 0);
+    ~LocalProjectClient();
+
+    CbObject MessageTransaction(CbObject Request);
+
+private:
+    struct ClientImpl;
+
+    std::unique_ptr<ClientImpl> m_Impl;
+};
+
+}  // namespace zen
diff --git a/zenserver-test/zenserver-test.cpp b/zenserver-test/zenserver-test.cpp
new file mode 100644
index 000000000..cb4ff06da
--- /dev/null
+++ b/zenserver-test/zenserver-test.cpp
@@ -0,0 +1,1092 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#define _SILENCE_CXX17_C_HEADER_DEPRECATION_WARNING
+
+#include <zencore/compactbinary.h>
+#include <zencore/compactbinarybuilder.h>
+#include <zencore/compactbinarypackage.h>
+#include <zencore/except.h>
+#include <zencore/filesystem.h>
+#include <zencore/iohash.h>
+#include <zencore/string.h>
+#include <zencore/thread.h>
+#include <zencore/timer.h>
+#include <zencore/trace.h>
+#include <zenserverprocess.h>
+
+#include <mimalloc.h>
+
+#include <http_parser.h>
+
+#if ZEN_PLATFORM_WINDOWS
+#    pragma comment(lib, "Crypt32.lib")
+#    pragma comment(lib, "Wldap32.lib")
+#endif
+#include <cpr/cpr.h>
+
+#include <spdlog/spdlog.h>
+
+#include <ppl.h>
+#include <atomic>
+#include <filesystem>
+#include <map>
+#include <random>
+
+#include <atlbase.h>
+#include <process.h>
+
+#include <asio.hpp>
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "projectclient.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#define DOCTEST_CONFIG_IMPLEMENT
+#include <doctest/doctest.h>
+#undef DOCTEST_CONFIG_IMPLEMENT
+
+using namespace fmt::literals;
+
+/*
+ * HTTP client
+ */
+
+class HttpConnectionPool;
+
+/**
+ * Http client connection
+ *
+ * Represents an established socket connection to a certain endpoint
+ *
+ * A request is started with Get(); the response is then read and parsed
+ * asynchronously on the io_context until the parser reports that the message
+ * is complete or an error occurs.
+ */
+
+class HttpClientConnection
+{
+    // Recover the connection object from the parser's user data pointer (set in Get())
+    static HttpClientConnection* This(http_parser* Parser) { return static_cast<HttpClientConnection*>(Parser->data); }
+
+public:
+    HttpClientConnection(asio::io_context& IoContext, HttpConnectionPool& Pool, asio::ip::tcp::socket&& InSocket)
+    : m_IoContext(IoContext)
+    , m_Pool(Pool)
+    , m_Resolver(IoContext)
+    , m_Socket(std::move(InSocket))
+    {
+    }
+    ~HttpClientConnection() {}
+
+    HttpConnectionPool& ConnectionPool() { return m_Pool; }
+    void                SetKeepAlive(bool NewState) { m_KeepAlive = NewState; }
+
+    /**
+     * Issue a GET request for Path on the already connected socket
+     *
+     * Server and Port are only used to build the Host header. The response is
+     * processed asynchronously, so the caller has to run the io_context.
+     */
+    void Get(const std::string_view Server, int Port, const std::string_view Path)
+    {
+        http_parser_init(&m_HttpParser, HTTP_RESPONSE);
+        m_HttpParser.data = this;
+
+        m_HttpParserSettings = http_parser_settings{
+            .on_message_begin    = [](http_parser* p) -> int { return This(p)->OnMessageBegin(); },
+            .on_url              = nullptr,
+            .on_status           = nullptr,
+            .on_header_field     = [](http_parser* p, const char* data, size_t size) { return This(p)->OnHeader(data, size); },
+            .on_header_value     = [](http_parser* p, const char* data, size_t size) { return This(p)->OnHeaderValue(data, size); },
+            .on_headers_complete = [](http_parser* p) -> int { return This(p)->OnHeadersComplete(); },
+            .on_body             = [](http_parser* p, const char* data, size_t size) { return This(p)->OnBody(data, size); },
+            .on_message_complete = [](http_parser* p) -> int { return This(p)->OnMessageComplete(); },
+            .on_chunk_header     = nullptr,
+            .on_chunk_complete   = nullptr};
+
+        // A pooled connection may have served an earlier request - start from a clean slate
+        m_RequestState = RequestState::Init;
+        m_CurrentHeaderName.clear();
+        m_Headers.clear();
+        m_Headers.reserve(16);
+
+        zen::ExtendableStringBuilder<256> RequestBody;
+        RequestBody << "GET " << Path << " HTTP/1.1\r\n";
+        RequestBody << "Host: " << Server;
+        if (Port > 0)
+        {
+            // The Host header has to carry the port whenever it is not the scheme default
+            RequestBody << ":" << int64_t(Port);
+        }
+        RequestBody << "\r\n";
+        RequestBody << "Accept: */*\r\n";
+        RequestBody << "Connection: " << (m_KeepAlive ? "keep-alive" : "close") << "\r\n\r\n";  // TODO: support keep-alive
+
+        m_RequestBody = RequestBody;
+
+        OnConnected();
+    }
+
+private:
+    void Reset() {}
+
+    void OnError(const std::error_code& Error) { spdlog::error("HTTP client error! '{}'", Error.message()); }
+
+    // Parser callbacks. Data points into the receive buffer, which is overwritten by the
+    // next read, so anything that has to outlive the callback is copied.
+
+    int OnHeader(const char* Data, size_t Bytes)
+    {
+        m_CurrentHeaderName.assign(Data, Bytes);
+        return 0;
+    }
+    int OnHeaderValue(const char* Data, size_t Bytes)
+    {
+        m_Headers.emplace_back(HeaderEntry{m_CurrentHeaderName, std::string(Data, Bytes)});
+        return 0;
+    }
+    int OnHeadersComplete()
+    {
+        spdlog::debug("Headers complete");
+        return 0;
+    }
+    int OnMessageComplete()
+    {
+        if (http_should_keep_alive(&m_HttpParser))
+        {
+            Reset();
+        }
+        else
+        {
+            m_Socket.close();
+        }
+
+        // The response has been fully received in both cases. Without this a kept-alive
+        // connection would keep re-arming reads and never let the io_context run dry.
+        m_RequestState = RequestState::Done;
+        return 0;
+    }
+    int OnMessageBegin() { return 0; }
+    int OnBody(const char* /* Data */, size_t /* Bytes */) { return 0; }
+
+    void OnConnected()
+    {
+        // Send initial request payload
+        asio::async_write(m_Socket,
+                          asio::const_buffer(m_RequestBody.data(), m_RequestBody.size()),
+                          [this](const std::error_code& Error, size_t /* Bytes */) {
+                              if (Error)
+                              {
+                                  return OnError(Error);
+                              }
+
+                              OnRequestWritten();
+                          });
+    }
+
+    void OnRequestWritten() { ReadResponseData(); }
+
+    // Queue a read of at least one byte of response data and hand it to the parser
+    void ReadResponseData()
+    {
+        asio::async_read(m_Socket, m_ResponseBuffer, asio::transfer_at_least(1), [this](const std::error_code& Error, size_t Bytes) {
+            if (Error)
+            {
+                return OnError(Error);
+            }
+
+            OnStatusLineRead(Bytes);
+        });
+    }
+
+    // Feed Bytes bytes of freshly received data to the parser and keep reading until the message is done
+    void OnStatusLineRead(size_t Bytes)
+    {
+        // Parse
+
+        http_parser_execute(&m_HttpParser, &m_HttpParserSettings, static_cast<const char*>(m_ResponseBuffer.data()), Bytes);
+
+        if (m_HttpParser.http_errno != 0)
+        {
+            // Something bad! The stream cannot be resynchronised, so give up on this connection
+
+            spdlog::error("parse error {}", (uint32_t)m_HttpParser.http_errno);
+
+            std::error_code Ignored;
+            m_Socket.close(Ignored);
+            m_RequestState = RequestState::Done;
+            return;
+        }
+
+        switch (m_RequestState)
+        {
+            case RequestState::Init:
+                ReadResponseData();
+                return;
+            case RequestState::Done:
+                break;
+        }
+    }
+
+private:
+    asio::io_context&       m_IoContext;
+    HttpConnectionPool&     m_Pool;
+    asio::ip::tcp::resolver m_Resolver;
+    asio::ip::tcp::socket   m_Socket;
+    std::string             m_Uri;
+    std::string             m_RequestBody;  // Initial request data
+    http_parser             m_HttpParser{};
+    http_parser_settings    m_HttpParserSettings{};
+    uint8_t                 m_ResponseIoBuffer[4096];
+    asio::mutable_buffer    m_ResponseBuffer{m_ResponseIoBuffer, sizeof m_ResponseIoBuffer};
+
+    enum class RequestState
+    {
+        Init,
+        Done
+    };
+
+    RequestState m_RequestState = RequestState::Init;
+
+    // Header strings are owned copies; the receive buffer they were parsed from gets reused
+    struct HeaderEntry
+    {
+        std::string Name;
+        std::string Value;
+    };
+
+    std::string              m_CurrentHeaderName;  // Used while parsing headers
+    std::vector<HeaderEntry> m_Headers;
+    bool                     m_KeepAlive = false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+class HttpConnectionPool
+{
+public:
+    HttpConnectionPool(asio::io_context& Context, std::string_view HostName, uint16_t Port);
+    ~HttpConnectionPool();
+
+    std::unique_ptr<HttpClientConnection> GetConnection();
+    void                                  ReturnConnection(std::unique_ptr<HttpClientConnection>&& Connection);
+
+private:
+    zen::RwLock                        m_Lock;
+    asio::io_context&                  m_Context;
+    std::vector<HttpClientConnection*> m_AvailableConnections;
+    std::string                        m_HostName;
+    uint16_t                           m_Port;
+};
+
+HttpConnectionPool::HttpConnectionPool(asio::io_context& Context, std::string_view HostName, uint16_t Port)
+: m_Context(Context)
+, m_HostName(HostName)
+, m_Port(Port)
+{
+}
+
+HttpConnectionPool::~HttpConnectionPool()
+{
+    zen::RwLock::ExclusiveLockScope ScopedLock(m_Lock);
+
+    for (auto $ :
m_AvailableConnections)
+    {
+        delete $;
+    }
+}
+
+// Hand out an idle pooled connection, or establish a new one if the pool is empty.
+// Returns nullptr if the host cannot be resolved or connected to.
+std::unique_ptr<HttpClientConnection>
+HttpConnectionPool::GetConnection()
+{
+    zen::RwLock::ExclusiveLockScope ScopedLock(m_Lock);
+
+    if (m_AvailableConnections.empty())
+    {
+        zen::StringBuilder<16> Service;
+        Service << int64_t(m_Port);
+
+        asio::ip::tcp::resolver Resolver{m_Context};
+
+        std::error_code ErrCode;
+        auto            it = Resolver.resolve(m_HostName, Service, ErrCode);
+
+        if (ErrCode)
+        {
+            return nullptr;
+        }
+
+        asio::ip::tcp::socket Socket{m_Context};
+        asio::connect(Socket, it, ErrCode);
+
+        if (ErrCode)
+        {
+            return nullptr;
+        }
+
+        return std::make_unique<HttpClientConnection>(m_Context, *this, std::move(Socket));
+    }
+
+    std::unique_ptr<HttpClientConnection> Connection{m_AvailableConnections.back()};
+    m_AvailableConnections.pop_back();
+    return Connection;
+}
+
+// Put a connection back into the pool; the pool takes ownership. Null connections are ignored.
+void
+HttpConnectionPool::ReturnConnection(std::unique_ptr<HttpClientConnection>&& Connection)
+{
+    if (!Connection)
+    {
+        return;
+    }
+
+    zen::RwLock::ExclusiveLockScope ScopedLock(m_Lock);
+
+    // Only give up ownership once the pointer is safely stored, so a failed allocation cannot leak it
+    m_AvailableConnections.push_back(Connection.get());
+    Connection.release();
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+/**
+ * Owns one connection pool per (host name, port) pair
+ */
+class HttpContext
+{
+public:
+    HttpContext(asio::io_context& Context) : m_Context(Context) {}
+    ~HttpContext() = default;
+
+    std::unique_ptr<HttpClientConnection> GetConnection(std::string_view HostName, uint16_t Port)
+    {
+        return ConnectionPool(HostName, Port).GetConnection();
+    }
+
+    void ReturnConnection(std::unique_ptr<HttpClientConnection> Connection)
+    {
+        if (!Connection)
+        {
+            return;
+        }
+
+        HttpConnectionPool& Pool = Connection->ConnectionPool();
+        Pool.ReturnConnection(std::move(Connection));
+    }
+
+    // Look up the pool for the given endpoint, creating it on first use
+    HttpConnectionPool& ConnectionPool(std::string_view HostName, uint16_t Port)
+    {
+        zen::RwLock::ExclusiveLockScope _(m_Lock);
+        ConnectionId                    ConnId{std::string(HostName), Port};
+
+        // try_emplace builds the pool in place, and only when the key is missing. This avoids
+        // copying a pool object (which holds raw owning pointers) into the map.
+        return m_ConnectionPools.try_emplace(std::move(ConnId), m_Context, HostName, Port).first->second;
+    }
+
+private:
+    asio::io_context& m_Context;
+
+    struct ConnectionId
+    {
+        inline bool operator<(const ConnectionId& Rhs) const
+        {
+            if (HostName != Rhs.HostName)
+            {
+                return HostName < Rhs.HostName;
+            }
+
+            return Port < Rhs.Port;
+        }
+
+        std::string HostName;
+        uint16_t    Port;
+    };
+
+    zen::RwLock                                m_Lock;
+    std::map<ConnectionId, HttpConnectionPool> m_ConnectionPools;
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+/**
+ * A single HTTP request; borrows a connection from the context and returns it on destruction
+ */
+class HttpClientRequest
+{
+public:
+    HttpClientRequest(HttpContext& Context) : m_HttpContext(Context) {}
+    ~HttpClientRequest()
+    {
+        if (m_Connection)
+        {
+            m_HttpContext.ReturnConnection(std::move(m_Connection));
+        }
+    }
+
+    /**
+     * Start a GET request for Url, which must contain host, port and path
+     *
+     * An unusable URL or a failed connection attempt is logged and no request is issued.
+     */
+    void Get(const std::string_view Url)
+    {
+        http_parser_url ParsedUrl{};
+        int             ErrCode = http_parser_parse_url(Url.data(), Url.size(), 0, &ParsedUrl);
+
+        if (ErrCode)
+        {
+            ZEN_NOT_IMPLEMENTED();
+        }
+
+        // field_set is a bit mask indexed by the UF_* values, so each one has to be shifted into place
+        const unsigned RequiredFields = (1u << UF_HOST) | (1u << UF_PORT) | (1u << UF_PATH);
+
+        if ((ParsedUrl.field_set & RequiredFields) != RequiredFields)
+        {
+            // Bad URL
+            spdlog::error("invalid URL '{}' (host, port and path are required)", Url);
+            return;
+        }
+
+        std::string_view HostName = Url.substr(ParsedUrl.field_data[UF_HOST].off, ParsedUrl.field_data[UF_HOST].len);
+
+        // Everything from the start of the path to the end of the URL, without relying on NUL termination
+        std::string_view Path = Url.substr(ParsedUrl.field_data[UF_PATH].off);
+
+        m_Connection = m_HttpContext.GetConnection(HostName, ParsedUrl.port);
+
+        if (!m_Connection)
+        {
+            spdlog::error("unable to connect to '{}' port {}", HostName, ParsedUrl.port);
+            return;
+        }
+
+        m_Connection->Get(HostName, ParsedUrl.port, Path);
+    }
+
+private:
+    HttpContext&                          m_HttpContext;
+    std::unique_ptr<HttpClientConnection> m_Connection;
+};
+
+//////////////////////////////////////////////////////////////////////////
+//
+// Custom logging -- test code, this should be tweaked
+//
+
+namespace logging {
+using namespace spdlog;
+using namespace spdlog::details;
+using namespace std::literals;
+
+class full_formatter final : public spdlog::formatter
+{
+public:
+    full_formatter(std::string_view LogId, std::chrono::time_point<std::chrono::system_clock> Epoch) : m_Epoch(Epoch),
m_LogId(LogId) {}
+
+    virtual std::unique_ptr<formatter> clone() const override { return std::make_unique<full_formatter>(m_LogId, m_Epoch); }
+
+    // When false, timestamps are printed as elapsed hh:mm:ss since m_Epoch instead of a calendar date/time
+    static constexpr bool UseDate = false;
+
+    /**
+     * Append one formatted log line to dest:
+     *
+     *   [timestamp.millis] [log id] [logger name] [level] [file:line] payload
+     *
+     * The log id, logger name and source location parts are omitted when empty.
+     */
+    virtual void format(const details::log_msg& msg, memory_buf_t& dest) override
+    {
+        using std::chrono::duration_cast;
+        using std::chrono::milliseconds;
+        using std::chrono::seconds;
+
+        if constexpr (UseDate)
+        {
+            auto secs = std::chrono::duration_cast<seconds>(msg.time.time_since_epoch());
+            if (secs != m_LastLogSecs)
+            {
+                m_CachedTm    = os::localtime(log_clock::to_time_t(msg.time));
+                m_LastLogSecs = secs;
+            }
+        }
+
+        const auto& tm_time = m_CachedTm;
+
+        // cache the date/time part for the next second.
+        auto duration = msg.time - m_Epoch;
+        auto secs     = duration_cast<seconds>(duration);
+
+        if (m_CacheTimestamp != secs || m_CachedDatetime.size() == 0)
+        {
+            m_CachedDatetime.clear();
+            m_CachedDatetime.push_back('[');
+
+            if constexpr (UseDate)
+            {
+                fmt_helper::append_int(tm_time.tm_year + 1900, m_CachedDatetime);
+                m_CachedDatetime.push_back('-');
+
+                fmt_helper::pad2(tm_time.tm_mon + 1, m_CachedDatetime);
+                m_CachedDatetime.push_back('-');
+
+                fmt_helper::pad2(tm_time.tm_mday, m_CachedDatetime);
+                m_CachedDatetime.push_back(' ');
+
+                fmt_helper::pad2(tm_time.tm_hour, m_CachedDatetime);
+                m_CachedDatetime.push_back(':');
+
+                fmt_helper::pad2(tm_time.tm_min, m_CachedDatetime);
+                m_CachedDatetime.push_back(':');
+
+                fmt_helper::pad2(tm_time.tm_sec, m_CachedDatetime);
+            }
+            else
+            {
+                // Split the elapsed seconds into hours : minutes : seconds
+                int Count = int(secs.count());
+
+                const int LogSecs = Count % 60;
+                Count /= 60;
+
+                const int LogMins = Count % 60;
+                Count /= 60;
+
+                const int LogHours = Count;
+
+                fmt_helper::pad2(LogHours, m_CachedDatetime);
+                m_CachedDatetime.push_back(':');
+                fmt_helper::pad2(LogMins, m_CachedDatetime);
+                m_CachedDatetime.push_back(':');
+                fmt_helper::pad2(LogSecs, m_CachedDatetime);
+            }
+
+            m_CachedDatetime.push_back('.');
+
+            m_CacheTimestamp = secs;
+        }
+
+        dest.append(m_CachedDatetime.begin(), m_CachedDatetime.end());
+
+        auto millis = fmt_helper::time_fraction<milliseconds>(msg.time);
+        fmt_helper::pad3(static_cast<uint32_t>(millis.count()), dest);
+        dest.push_back(']');
+        dest.push_back(' ');
+
+        if (!m_LogId.empty())
+        {
+            dest.push_back('[');
+            fmt_helper::append_string_view(m_LogId, dest);
+            dest.push_back(']');
+            dest.push_back(' ');
+        }
+
+        // append logger name if exists
+        if (msg.logger_name.size() > 0)
+        {
+            dest.push_back('[');
+            fmt_helper::append_string_view(msg.logger_name, dest);
+            dest.push_back(']');
+            dest.push_back(' ');
+        }
+
+        dest.push_back('[');
+        // wrap the level name with color
+        msg.color_range_start = dest.size();
+        fmt_helper::append_string_view(level::to_string_view(msg.level), dest);
+        msg.color_range_end = dest.size();
+        dest.push_back(']');
+        dest.push_back(' ');
+
+        // add source location if present
+        if (!msg.source.empty())
+        {
+            dest.push_back('[');
+            const char* filename = details::short_filename_formatter<details::null_scoped_padder>::basename(msg.source.filename);
+            fmt_helper::append_string_view(filename, dest);
+            dest.push_back(':');
+            fmt_helper::append_int(msg.source.line, dest);
+            dest.push_back(']');
+            dest.push_back(' ');
+        }
+
+        fmt_helper::append_string_view(msg.payload, dest);
+        fmt_helper::append_string_view("\n"sv, dest);
+    }
+
+private:
+    std::chrono::time_point<std::chrono::system_clock> m_Epoch;
+    std::tm                                            m_CachedTm{};         // Only refreshed when UseDate is true
+    std::chrono::seconds                               m_LastLogSecs{0};     // Zero-initialized so the first UseDate comparison is well defined
+    std::chrono::seconds                               m_CacheTimestamp{0};  // Second for which m_CachedDatetime was built
+    memory_buf_t                                       m_CachedDatetime;
+    std::string                                        m_LogId;
+};
+}  // namespace logging
+
+//////////////////////////////////////////////////////////////////////////
+
+#if 0
+#    include <cpr/cpr.h>
+
+#    pragma comment(lib, "Crypt32.lib")
+#    pragma comment(lib, "Wldap32.lib")
+
+int
+main()
+{
+    mi_version();
+
+    zen::Sleep(1000);
+
+    zen::Stopwatch timer;
+
+    const int RequestCount = 100000;
+
+    cpr::Session Sessions[10];
+
+    for (auto& Session : Sessions)
+    {
+
Session.SetUrl(cpr::Url{"http://localhost:1337/test/hello"});
+        //Session.SetUrl(cpr::Url{ "http://arn-wd-l0182:1337/test/hello" });
+    }
+
+    auto Run = [](cpr::Session& Session) {
+        for (int i = 0; i < 10000; ++i)
+        {
+            cpr::Response Result = Session.Get();
+
+            if (Result.status_code != 200)
+            {
+                spdlog::warn("request response: {}", Result.status_code);
+            }
+        }
+    };
+
+    Concurrency::parallel_invoke([&] { Run(Sessions[0]); },
+                                 [&] { Run(Sessions[1]); },
+                                 [&] { Run(Sessions[2]); },
+                                 [&] { Run(Sessions[3]); },
+                                 [&] { Run(Sessions[4]); },
+                                 [&] { Run(Sessions[5]); },
+                                 [&] { Run(Sessions[6]); },
+                                 [&] { Run(Sessions[7]); },
+                                 [&] { Run(Sessions[8]); },
+                                 [&] { Run(Sessions[9]); });
+
+    // cpr::Response r = cpr::Get(cpr::Url{ "http://localhost:1337/test/hello" });
+
+    spdlog::info("{} requests in {} ({})",
+                 RequestCount,
+                 zen::NiceTimeSpanMs(timer.getElapsedTimeMs()),
+                 zen::NiceRate(RequestCount, (uint32_t)timer.getElapsedTimeMs(), "req"));
+
+    return 0;
+}
+#elif 0
+//#include <restinio/all.hpp>
+
+int
+main()
+{
+    mi_version();
+    restinio::run(restinio::on_thread_pool(32).port(8080).request_handler(
+        [](auto req) { return req->create_response().set_body("Hello, World!").done(); }));
+    return 0;
+}
+#else
+
+ZenTestEnvironment TestEnv;
+
+// Test runner entry point: configures logging, points the shared test environment at the
+// program directory and a ".test" scratch directory two levels above it, then runs doctest.
+int
+main(int argc, char** argv)
+{
+    mi_version();
+
+    zencore_forcelinktests();
+
+    spdlog::set_level(spdlog::level::debug);
+
+    auto Formatter = std::make_unique<logging::full_formatter>("test", std::chrono::system_clock::now());
+    spdlog::set_formatter(std::move(Formatter));
+
+    const std::filesystem::path ExecutablePath{argv[0]};
+    std::filesystem::path       ProgramBaseDir = ExecutablePath.parent_path();
+    std::filesystem::path       TestBaseDir    = ProgramBaseDir.parent_path().parent_path() / ".test";
+
+    TestEnv.Initialize(ProgramBaseDir, TestBaseDir);
+
+    spdlog::info("Running tests...");
+
+    doctest::Context TestContext(argc, argv);
+    return TestContext.run();
+}
+
+#    if 1
+// Issues a single GET through the experimental asio based client against a freshly spawned server
+TEST_CASE("asio.http")
+{
+    std::filesystem::path ScratchDir = TestEnv.CreateNewTestDir();
+
+    ZenServerInstance Server(TestEnv);
+    Server.SetTestDir(ScratchDir);
+    Server.SpawnServer(13337);
+
+    spdlog::info("Waiting...");
+
+    Server.WaitUntilReady();
+
+    // asio test
+
+    asio::io_context  IoContext;
+    HttpContext       HttpCtx(IoContext);
+    HttpClientRequest Request(HttpCtx);
+    Request.Get("http://localhost:13337/test/hello");
+
+    // Drives the asynchronous request until there is no more work queued
+    IoContext.run();
+}
+#    endif
+
+// Hammers a single server instance with ten concurrent batches of 10000 GET requests each
+TEST_CASE("default.single")
+{
+    std::filesystem::path ScratchDir = TestEnv.CreateNewTestDir();
+
+    ZenServerInstance Server(TestEnv);
+    Server.SetTestDir(ScratchDir);
+    Server.SpawnServer(13337);
+
+    spdlog::info("Waiting...");
+
+    Server.WaitUntilReady();
+
+    std::atomic<uint64_t> RequestCount{0};
+    std::atomic<uint64_t> BatchCounter{0};
+
+    spdlog::info("Running single server test...");
+
+    // One batch: a dedicated session issuing sequential requests
+    auto IssueTestRequests = [&] {
+        const uint64_t BatchNo  = BatchCounter.fetch_add(1);
+        const DWORD    ThreadId = GetCurrentThreadId();
+
+        spdlog::info("query batch {} started (thread {})", BatchNo, ThreadId);
+
+        cpr::Session Session;
+        Session.SetUrl(cpr::Url{"http://localhost:13337/test/hello"});
+
+        int Remaining = 10000;
+        while (Remaining-- > 0)
+        {
+            auto Response = Session.Get();
+            ++RequestCount;
+        }
+
+        spdlog::info("query batch {} ended (thread {})", BatchNo, ThreadId);
+    };
+
+    // Currently unused: would run ten batches per invocation
+    auto fun10 = [&] {
+        Concurrency::parallel_invoke(IssueTestRequests,
+                                     IssueTestRequests,
+                                     IssueTestRequests,
+                                     IssueTestRequests,
+                                     IssueTestRequests,
+                                     IssueTestRequests,
+                                     IssueTestRequests,
+                                     IssueTestRequests,
+                                     IssueTestRequests,
+                                     IssueTestRequests);
+    };
+
+    zen::Stopwatch Timer;
+
+    // Concurrency::parallel_invoke(fun10, fun10, fun, fun, fun, fun, fun, fun, fun, fun);
+    Concurrency::parallel_invoke(IssueTestRequests,
+                                 IssueTestRequests,
+                                 IssueTestRequests,
+                                 IssueTestRequests,
+                                 IssueTestRequests,
+                                 IssueTestRequests,
+                                 IssueTestRequests,
+                                 IssueTestRequests,
+                                 IssueTestRequests,
+                                 IssueTestRequests);
+
+    const uint64_t ElapsedMs = Timer.getElapsedTimeMs();
+
+    spdlog::info("{} requests in {} ({})",
+                 RequestCount,
+                 zen::NiceTimeSpanMs(ElapsedMs),
+                 zen::NiceRate(RequestCount, (uint32_t)ElapsedMs, "req"));
+}
+
+// Runs two server instances side by side and queries both concurrently (two batches per server)
+TEST_CASE("multi.basic")
+{
+    ZenServerInstance     Instance1(TestEnv);
+    std::filesystem::path TestDir1 = TestEnv.CreateNewTestDir();
+    Instance1.SetTestDir(TestDir1);
+    Instance1.SpawnServer(13337);
+
+    ZenServerInstance     Instance2(TestEnv);
+    std::filesystem::path TestDir2 = TestEnv.CreateNewTestDir();
+    Instance2.SetTestDir(TestDir2);
+    Instance2.SpawnServer(13338);
+
+    spdlog::info("Waiting...");
+
+    Instance1.WaitUntilReady();
+    Instance2.WaitUntilReady();
+
+    std::atomic<uint64_t> RequestCount{0};
+    std::atomic<uint64_t> BatchCounter{0};
+
+    auto IssueTestRequests = [&](int PortNumber) {
+        const uint64_t BatchNo  = BatchCounter.fetch_add(1);
+        const DWORD    ThreadId = GetCurrentThreadId();
+
+        spdlog::info("query batch {} started (thread {}) for port {}", BatchNo, ThreadId, PortNumber);
+
+        cpr::Session cli;
+        cli.SetUrl(cpr::Url{"http://localhost:{}/test/hello"_format(PortNumber)});
+
+        for (int i = 0; i < 10000; ++i)
+        {
+            auto res = cli.Get();
+            ++RequestCount;
+        }
+        spdlog::info("query batch {} ended (thread {})", BatchNo, ThreadId);
+    };
+
+    zen::Stopwatch timer;
+
+    spdlog::info("Running multi-server test...");
+
+    Concurrency::parallel_invoke([&] { IssueTestRequests(13337); },
+                                 [&] { IssueTestRequests(13338); },
+                                 [&] { IssueTestRequests(13337); },
+                                 [&] { IssueTestRequests(13338); });
+
+    uint64_t Elapsed = timer.getElapsedTimeMs();
+
+    spdlog::info("{} requests in {} ({})",
+                 RequestCount,
+                 zen::NiceTimeSpanMs(Elapsed),
+                 zen::NiceRate(RequestCount, (uint32_t)Elapsed, "req"));
+}
+
+// Stores generated chunks in the CAS, reads them back and verifies size and hash, then restarts
+// the server on the same directory and verifies the chunks are still there
+TEST_CASE("cas.basic")
+{
+    std::filesystem::path TestDir = TestEnv.CreateNewTestDir();
+
+    const uint16_t PortNumber = 13337;
+
+    const int IterationCount = 1000;
+
+    // Sizes are kept as size_t so they compare against std::string::size() without sign conversion
+    std::vector<size_t>      ChunkSizes(IterationCount);
+    std::vector<zen::IoHash> ChunkHashes(IterationCount);
+
+    {
+        ZenServerInstance Instance1(TestEnv);
+        Instance1.SetTestDir(TestDir);
+        Instance1.SpawnServer(PortNumber);
+        Instance1.WaitUntilReady();
+
+        std::atomic<uint64_t> RequestCount{0};
+
+        zen::Stopwatch timer;
+
+        // Default-seeded, so the generated chunk sizes are the same on every run
+        std::mt19937_64 mt;
+
+        auto BaseUri = "http://localhost:{}/cas"_format(PortNumber);
+
+        cpr::Session cli;
+        cli.SetUrl(cpr::Url{BaseUri});
+
+        // Populate CAS with some generated data
+
+        for (int i = 0; i < IterationCount; ++i)
+        {
+            // Between 5 and 10004 bytes; the cast makes the narrowing from the 64-bit generator output explicit
+            const size_t ChunkSize = static_cast<size_t>(mt() % 10000 + 5);
+            std::string  body      = fmt::format("{}", i);
+            body.resize(ChunkSize, ' ');
+
+            ChunkSizes[i]  = ChunkSize;
+            ChunkHashes[i] = zen::IoHash::HashMemory(body.data(), body.size());
+
+            cli.SetBody(body);
+
+            auto res = cli.Post();
+            CHECK(!res.error);
+
+            ++RequestCount;
+        }
+
+        // Verify that the chunks persisted
+
+        for (int i = 0; i < IterationCount; ++i)
+        {
+            zen::ExtendableStringBuilder<128> Uri;
+            Uri << BaseUri << "/";
+            ChunkHashes[i].ToHexString(Uri);
+
+            auto res = cpr::Get(cpr::Url{Uri.c_str()});
+            CHECK(!res.error);
+            CHECK(res.status_code == 200);
+            CHECK(res.text.size() == ChunkSizes[i]);
+
+            zen::IoHash Hash = zen::IoHash::HashMemory(res.text.data(), res.text.size());
+
+            CHECK(ChunkHashes[i] == Hash);
+
+            ++RequestCount;
+        }
+
+        uint64_t Elapsed = timer.getElapsedTimeMs();
+
+        spdlog::info("{} requests in {} ({})",
+                     RequestCount,
+                     zen::NiceTimeSpanMs(Elapsed),
+                     zen::NiceRate(RequestCount, (uint32_t)Elapsed, "req"));
+    }
+
+    // Verify that the data persists between process runs (the previous server has exited at this point)
+
+    {
+        ZenServerInstance Instance2(TestEnv);
+        Instance2.SetTestDir(TestDir);
+        Instance2.SpawnServer(PortNumber);
+        Instance2.WaitUntilReady();
+
+        for (int i = 0; i < IterationCount; ++i)
+        {
+            zen::ExtendableStringBuilder<128> Uri;
+            Uri << "http://localhost:{}/cas/"_format(PortNumber);
+            ChunkHashes[i].ToHexString(Uri);
+
+            auto res = cpr::Get(cpr::Url{Uri.c_str()});
+            CHECK(!res.error);  // Same transport-level check as the first verification pass
+            CHECK(res.status_code == 200);
+            CHECK(res.text.size() == ChunkSizes[i]);
+
+            zen::IoHash Hash = zen::IoHash::HashMemory(res.text.data(), res.text.size());
+
+            CHECK(ChunkHashes[i] == Hash);
+        }
+    }
+}
+
+TEST_CASE("project.basic") +{ + using namespace std::literals; + + std::filesystem::path TestDir = TestEnv.CreateNewTestDir(); + + const uint16_t PortNumber = 13337; + + ZenServerInstance Instance1(TestEnv); + Instance1.SetTestDir(TestDir); + Instance1.SpawnServer(PortNumber); + Instance1.WaitUntilReady(); + + std::atomic<uint64_t> RequestCount{0}; + + zen::Stopwatch timer; + + std::mt19937_64 mt; + + zen::StringBuilder<64> BaseUri; + BaseUri << "http://localhost:{}/prj/test"_format(PortNumber); + + SUBCASE("build store init") + { + { + { + zen::CbObjectWriter Body; + Body << "id" + << "test"; + Body << "root" + << "/zooom"; + Body << "project" + << "/zooom"; + Body << "engine" + << "/zooom"; + + zen::MemoryOutStream MemOut; + zen::BinaryWriter Writer{MemOut}; + Body.Save(Writer); + + auto Response = cpr::Post(cpr::Url{BaseUri.c_str()}, cpr::Body{(const char*)MemOut.Data(), MemOut.Size()}); + CHECK(Response.status_code == 201); + } + + { + auto Response = cpr::Get(cpr::Url{BaseUri.c_str()}); + CHECK(Response.status_code == 200); + + zen::CbObjectView ResponseObject = zen::CbFieldView(Response.text.data()).AsObjectView(); + + CHECK(ResponseObject["id"].AsString() == "test"sv); + CHECK(ResponseObject["root"].AsString() == "/zooom"sv); + } + } + + BaseUri << "/oplog/ps5"; + + { + { + zen::StringBuilder<64> PostUri; + PostUri << BaseUri; + auto Response = cpr::Post(cpr::Url{PostUri.c_str()}); + CHECK(Response.status_code == 201); + } + + { + auto Response = cpr::Get(cpr::Url{BaseUri.c_str()}); + CHECK(Response.status_code == 200); + + zen::CbObjectView ResponseObject = zen::CbFieldView(Response.text.data()).AsObjectView(); + + CHECK(ResponseObject["id"].AsString() == "ps5"sv); + CHECK(ResponseObject["project"].AsString() == "test"sv); + } + } + + SUBCASE("build store persistence") + { + uint8_t AttachData[] = {1, 2, 3}; + + zen::CbAttachment Attach{zen::SharedBuffer::Clone(zen::MemoryView{AttachData, 3})}; + + zen::CbObjectWriter OpWriter; + OpWriter << "key" + << 
"foo" + << "attachment" << Attach; + + const std::string_view ChunkId{ + "00000000" + "00000000" + "00010000"}; + auto FileOid = zen::Oid::FromHexString(ChunkId); + + OpWriter.BeginArray("files"); + OpWriter.BeginObject(); + OpWriter << "id" << FileOid; + OpWriter << "path" << __FILE__; + OpWriter.EndObject(); + OpWriter.EndArray(); + + OpWriter.BeginArray("serverfiles"); + OpWriter.BeginObject(); + OpWriter << "id" << FileOid; + OpWriter << "path" << __FILE__; + OpWriter.EndObject(); + OpWriter.EndArray(); + + zen::CbObject Op = OpWriter.Save(); + + zen::MemoryOutStream MemOut; + zen::BinaryWriter Writer(MemOut); + zen::CbPackage OpPackage(Op); + OpPackage.AddAttachment(Attach); + OpPackage.Save(Writer); + + { + zen::StringBuilder<64> PostUri; + PostUri << BaseUri << "/new"; + auto Response = cpr::Post(cpr::Url{PostUri.c_str()}, cpr::Body{(const char*)MemOut.Data(), MemOut.Size()}); + + REQUIRE(!Response.error); + CHECK(Response.status_code == 201); + } + + // Read file data + + { + zen::StringBuilder<128> ChunkGetUri; + ChunkGetUri << BaseUri << "/" << ChunkId; + auto Response = cpr::Get(cpr::Url{ChunkGetUri.c_str()}); + + REQUIRE(!Response.error); + CHECK(Response.status_code == 200); + } + + { + zen::StringBuilder<128> ChunkGetUri; + ChunkGetUri << BaseUri << "/" << ChunkId << "?offset=1&size=10"; + auto Response = cpr::Get(cpr::Url{ChunkGetUri.c_str()}); + + REQUIRE(!Response.error); + CHECK(Response.status_code == 200); + CHECK(Response.text.size() == 10); + } + + spdlog::info("+++++++"); + } + SUBCASE("build store op commit") { spdlog::info("-------"); } + } + + const uint64_t Elapsed = timer.getElapsedTimeMs(); + + spdlog::info("{} requests in {} ({})", + RequestCount, + zen::NiceTimeSpanMs(Elapsed), + zen::NiceRate(RequestCount, (uint32_t)Elapsed, "req")); +} + +TEST_CASE("project.pipe") +{ + using namespace std::literals; + + std::filesystem::path TestDir = TestEnv.CreateNewTestDir(); + + const uint16_t PortNumber = 13337; + + ZenServerInstance 
Instance1(TestEnv); + Instance1.SetTestDir(TestDir); + Instance1.SpawnServer(PortNumber); + Instance1.WaitUntilReady(); + + zen::LocalProjectClient LocalClient(PortNumber); + + zen::CbObjectWriter Cbow; + Cbow << "hey" << 42; + + zen::CbObject Response = LocalClient.MessageTransaction(Cbow.Save()); +} + +#endif diff --git a/zenserver-test/zenserver-test.vcxproj b/zenserver-test/zenserver-test.vcxproj new file mode 100644 index 000000000..8cf7df84d --- /dev/null +++ b/zenserver-test/zenserver-test.vcxproj @@ -0,0 +1,114 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>16.0</VCProjectVersion> + <Keyword>Win32Proj</Keyword> + <ProjectGuid>{2563249e-e695-4cc4-8ffa-335d07680c9d}</ProjectGuid> + <RootNamespace>zenservertest</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + 
<CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_debug.props" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_release.props" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <LinkIncremental>true</LinkIncremental> + <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);..\zencore\include</IncludePath> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <LinkIncremental>false</LinkIncremental> + <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);..\zencore\include</IncludePath> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + 
<WarningLevel>Level3</WarningLevel> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>$(SolutionDir)\zentestutil\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>$(SolutionDir)\zentestutil\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ProjectReference Include="..\zencore\zencore.vcxproj"> + <Project>{d75bf9ab-c61e-4fff-ad59-1563430f05e2}</Project> + </ProjectReference> + <ProjectReference Include="..\zentestutil\zentestutil.vcxproj"> + <Project>{77f8315d-b21d-4db0-9a6f-2d3359f88a70}</Project> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <ClCompile Include="projectclient.cpp" /> + <ClCompile Include="zenserver-test.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="projectclient.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/zenserver-test/zenserver-test.vcxproj.filters b/zenserver-test/zenserver-test.vcxproj.filters new file mode 100644 index 000000000..59f077ab0 --- /dev/null +++ b/zenserver-test/zenserver-test.vcxproj.filters @@ -0,0 +1,10 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClCompile Include="zenserver-test.cpp" /> + <ClCompile Include="projectclient.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="projectclient.h" /> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/zenserver/admin/admin.h b/zenserver/admin/admin.h new file mode 100644 index 000000000..3bb8a9158 --- /dev/null +++ b/zenserver/admin/admin.h @@ -0,0 +1,18 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/httpserver.h> + +// HTTP service that reports "/admin/" as its base URI. HandleRequest only marks the request as +// unused, so this is currently a stub that performs no work for any request. +class HttpAdminService : public zen::HttpService +{ +public: + HttpAdminService() = default; + ~HttpAdminService() = default; + + virtual const char* BaseUri() const override { return "/admin/"; } + + virtual void HandleRequest(zen::HttpServerRequest& Request) override { ZEN_UNUSED(Request); } + +private: +}; diff --git a/zenserver/cache/cacheagent.cpp b/zenserver/cache/cacheagent.cpp new file mode 100644 index 000000000..f4d1cabe6 --- /dev/null +++ b/zenserver/cache/cacheagent.cpp @@ -0,0 +1,5 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "cacheagent.h" + +#include <gsl/gsl-lite.hpp> diff --git a/zenserver/cache/cacheagent.h b/zenserver/cache/cacheagent.h new file mode 100644 index 000000000..145d0f79f --- /dev/null +++ b/zenserver/cache/cacheagent.h @@ -0,0 +1,9 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +// Placeholder class: it declares no members yet (both access sections are empty). +class CacheAgent +{ +public: +private: +}; diff --git a/zenserver/cache/cachestore.cpp b/zenserver/cache/cachestore.cpp new file mode 100644 index 000000000..fc218de6b --- /dev/null +++ b/zenserver/cache/cachestore.cpp @@ -0,0 +1,1235 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "cachestore.h" + +#include <zencore/windows.h> + +#include <fmt/core.h> +#include <spdlog/spdlog.h> +#include <zencore/filesystem.h> +#include <zencore/iobuffer.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zenstore/cas.h> +#include <filesystem> +#include <gsl/gsl-lite.hpp> +#include <unordered_map> + +#include <atlfile.h> + +using namespace zen; + +namespace UE { + +static const uint32_t CRCTable[256] = { + 0x00000000, 0x04C11DB7, 0x09823B6E, 0x0D4326D9, 0x130476DC, 0x17C56B6B, 0x1A864DB2, 0x1E475005, 0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6, + 0x2B4BCB61, 0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD, 0x4C11DB70, 0x48D0C6C7, 0x4593E01E, 0x4152FDA9, 0x5F15ADAC, 0x5BD4B01B, + 0x569796C2, 0x52568B75, 0x6A1936C8, 0x6ED82B7F, 0x639B0DA6, 0x675A1011, 0x791D4014, 0x7DDC5DA3, 0x709F7B7A, 0x745E66CD, 0x9823B6E0, + 0x9CE2AB57, 0x91A18D8E, 0x95609039, 0x8B27C03C, 0x8FE6DD8B, 0x82A5FB52, 0x8664E6E5, 0xBE2B5B58, 0xBAEA46EF, 0xB7A96036, 0xB3687D81, + 0xAD2F2D84, 0xA9EE3033, 0xA4AD16EA, 0xA06C0B5D, 0xD4326D90, 0xD0F37027, 0xDDB056FE, 0xD9714B49, 0xC7361B4C, 0xC3F706FB, 0xCEB42022, + 0xCA753D95, 0xF23A8028, 0xF6FB9D9F, 0xFBB8BB46, 0xFF79A6F1, 0xE13EF6F4, 0xE5FFEB43, 0xE8BCCD9A, 0xEC7DD02D, 0x34867077, 0x30476DC0, + 0x3D044B19, 0x39C556AE, 0x278206AB, 0x23431B1C, 0x2E003DC5, 0x2AC12072, 0x128E9DCF, 0x164F8078, 0x1B0CA6A1, 0x1FCDBB16, 0x018AEB13, + 0x054BF6A4, 0x0808D07D, 0x0CC9CDCA, 0x7897AB07, 0x7C56B6B0, 0x71159069, 0x75D48DDE, 0x6B93DDDB, 0x6F52C06C, 0x6211E6B5, 0x66D0FB02, + 0x5E9F46BF, 0x5A5E5B08, 0x571D7DD1, 0x53DC6066, 0x4D9B3063, 0x495A2DD4, 0x44190B0D, 0x40D816BA, 0xACA5C697, 0xA864DB20, 0xA527FDF9, + 0xA1E6E04E, 0xBFA1B04B, 0xBB60ADFC, 0xB6238B25, 0xB2E29692, 0x8AAD2B2F, 0x8E6C3698, 0x832F1041, 0x87EE0DF6, 0x99A95DF3, 0x9D684044, + 0x902B669D, 0x94EA7B2A, 0xE0B41DE7, 0xE4750050, 0xE9362689, 0xEDF73B3E, 0xF3B06B3B, 0xF771768C, 0xFA325055, 0xFEF34DE2, 0xC6BCF05F, + 0xC27DEDE8, 0xCF3ECB31, 0xCBFFD686, 0xD5B88683, 0xD1799B34, 0xDC3ABDED, 
0xD8FBA05A, 0x690CE0EE, 0x6DCDFD59, 0x608EDB80, 0x644FC637, + 0x7A089632, 0x7EC98B85, 0x738AAD5C, 0x774BB0EB, 0x4F040D56, 0x4BC510E1, 0x46863638, 0x42472B8F, 0x5C007B8A, 0x58C1663D, 0x558240E4, + 0x51435D53, 0x251D3B9E, 0x21DC2629, 0x2C9F00F0, 0x285E1D47, 0x36194D42, 0x32D850F5, 0x3F9B762C, 0x3B5A6B9B, 0x0315D626, 0x07D4CB91, + 0x0A97ED48, 0x0E56F0FF, 0x1011A0FA, 0x14D0BD4D, 0x19939B94, 0x1D528623, 0xF12F560E, 0xF5EE4BB9, 0xF8AD6D60, 0xFC6C70D7, 0xE22B20D2, + 0xE6EA3D65, 0xEBA91BBC, 0xEF68060B, 0xD727BBB6, 0xD3E6A601, 0xDEA580D8, 0xDA649D6F, 0xC423CD6A, 0xC0E2D0DD, 0xCDA1F604, 0xC960EBB3, + 0xBD3E8D7E, 0xB9FF90C9, 0xB4BCB610, 0xB07DABA7, 0xAE3AFBA2, 0xAAFBE615, 0xA7B8C0CC, 0xA379DD7B, 0x9B3660C6, 0x9FF77D71, 0x92B45BA8, + 0x9675461F, 0x8832161A, 0x8CF30BAD, 0x81B02D74, 0x857130C3, 0x5D8A9099, 0x594B8D2E, 0x5408ABF7, 0x50C9B640, 0x4E8EE645, 0x4A4FFBF2, + 0x470CDD2B, 0x43CDC09C, 0x7B827D21, 0x7F436096, 0x7200464F, 0x76C15BF8, 0x68860BFD, 0x6C47164A, 0x61043093, 0x65C52D24, 0x119B4BE9, + 0x155A565E, 0x18197087, 0x1CD86D30, 0x029F3D35, 0x065E2082, 0x0B1D065B, 0x0FDC1BEC, 0x3793A651, 0x3352BBE6, 0x3E119D3F, 0x3AD08088, + 0x2497D08D, 0x2056CD3A, 0x2D15EBE3, 0x29D4F654, 0xC5A92679, 0xC1683BCE, 0xCC2B1D17, 0xC8EA00A0, 0xD6AD50A5, 0xD26C4D12, 0xDF2F6BCB, + 0xDBEE767C, 0xE3A1CBC1, 0xE760D676, 0xEA23F0AF, 0xEEE2ED18, 0xF0A5BD1D, 0xF464A0AA, 0xF9278673, 0xFDE69BC4, 0x89B8FD09, 0x8D79E0BE, + 0x803AC667, 0x84FBDBD0, 0x9ABC8BD5, 0x9E7D9662, 0x933EB0BB, 0x97FFAD0C, 0xAFB010B1, 0xAB710D06, 0xA6322BDF, 0xA2F33668, 0xBCB4666D, + 0xB8757BDA, 0xB5365D03, 0xB1F740B4}; + +static const uint32_t CRCTablesSB8[8][256] = { + {0x00000000, 0xb71dc104, 0x6e3b8209, 0xd926430d, 0xdc760413, 0x6b6bc517, 0xb24d861a, 0x0550471e, 0xb8ed0826, 0x0ff0c922, 0xd6d68a2f, + 0x61cb4b2b, 0x649b0c35, 0xd386cd31, 0x0aa08e3c, 0xbdbd4f38, 0x70db114c, 0xc7c6d048, 0x1ee09345, 0xa9fd5241, 0xacad155f, 0x1bb0d45b, + 0xc2969756, 0x758b5652, 0xc836196a, 0x7f2bd86e, 0xa60d9b63, 0x11105a67, 0x14401d79, 0xa35ddc7d, 
0x7a7b9f70, 0xcd665e74, 0xe0b62398, + 0x57abe29c, 0x8e8da191, 0x39906095, 0x3cc0278b, 0x8bdde68f, 0x52fba582, 0xe5e66486, 0x585b2bbe, 0xef46eaba, 0x3660a9b7, 0x817d68b3, + 0x842d2fad, 0x3330eea9, 0xea16ada4, 0x5d0b6ca0, 0x906d32d4, 0x2770f3d0, 0xfe56b0dd, 0x494b71d9, 0x4c1b36c7, 0xfb06f7c3, 0x2220b4ce, + 0x953d75ca, 0x28803af2, 0x9f9dfbf6, 0x46bbb8fb, 0xf1a679ff, 0xf4f63ee1, 0x43ebffe5, 0x9acdbce8, 0x2dd07dec, 0x77708634, 0xc06d4730, + 0x194b043d, 0xae56c539, 0xab068227, 0x1c1b4323, 0xc53d002e, 0x7220c12a, 0xcf9d8e12, 0x78804f16, 0xa1a60c1b, 0x16bbcd1f, 0x13eb8a01, + 0xa4f64b05, 0x7dd00808, 0xcacdc90c, 0x07ab9778, 0xb0b6567c, 0x69901571, 0xde8dd475, 0xdbdd936b, 0x6cc0526f, 0xb5e61162, 0x02fbd066, + 0xbf469f5e, 0x085b5e5a, 0xd17d1d57, 0x6660dc53, 0x63309b4d, 0xd42d5a49, 0x0d0b1944, 0xba16d840, 0x97c6a5ac, 0x20db64a8, 0xf9fd27a5, + 0x4ee0e6a1, 0x4bb0a1bf, 0xfcad60bb, 0x258b23b6, 0x9296e2b2, 0x2f2bad8a, 0x98366c8e, 0x41102f83, 0xf60dee87, 0xf35da999, 0x4440689d, + 0x9d662b90, 0x2a7bea94, 0xe71db4e0, 0x500075e4, 0x892636e9, 0x3e3bf7ed, 0x3b6bb0f3, 0x8c7671f7, 0x555032fa, 0xe24df3fe, 0x5ff0bcc6, + 0xe8ed7dc2, 0x31cb3ecf, 0x86d6ffcb, 0x8386b8d5, 0x349b79d1, 0xedbd3adc, 0x5aa0fbd8, 0xeee00c69, 0x59fdcd6d, 0x80db8e60, 0x37c64f64, + 0x3296087a, 0x858bc97e, 0x5cad8a73, 0xebb04b77, 0x560d044f, 0xe110c54b, 0x38368646, 0x8f2b4742, 0x8a7b005c, 0x3d66c158, 0xe4408255, + 0x535d4351, 0x9e3b1d25, 0x2926dc21, 0xf0009f2c, 0x471d5e28, 0x424d1936, 0xf550d832, 0x2c769b3f, 0x9b6b5a3b, 0x26d61503, 0x91cbd407, + 0x48ed970a, 0xfff0560e, 0xfaa01110, 0x4dbdd014, 0x949b9319, 0x2386521d, 0x0e562ff1, 0xb94beef5, 0x606dadf8, 0xd7706cfc, 0xd2202be2, + 0x653deae6, 0xbc1ba9eb, 0x0b0668ef, 0xb6bb27d7, 0x01a6e6d3, 0xd880a5de, 0x6f9d64da, 0x6acd23c4, 0xddd0e2c0, 0x04f6a1cd, 0xb3eb60c9, + 0x7e8d3ebd, 0xc990ffb9, 0x10b6bcb4, 0xa7ab7db0, 0xa2fb3aae, 0x15e6fbaa, 0xccc0b8a7, 0x7bdd79a3, 0xc660369b, 0x717df79f, 0xa85bb492, + 0x1f467596, 0x1a163288, 0xad0bf38c, 0x742db081, 0xc3307185, 0x99908a5d, 0x2e8d4b59, 
0xf7ab0854, 0x40b6c950, 0x45e68e4e, 0xf2fb4f4a, + 0x2bdd0c47, 0x9cc0cd43, 0x217d827b, 0x9660437f, 0x4f460072, 0xf85bc176, 0xfd0b8668, 0x4a16476c, 0x93300461, 0x242dc565, 0xe94b9b11, + 0x5e565a15, 0x87701918, 0x306dd81c, 0x353d9f02, 0x82205e06, 0x5b061d0b, 0xec1bdc0f, 0x51a69337, 0xe6bb5233, 0x3f9d113e, 0x8880d03a, + 0x8dd09724, 0x3acd5620, 0xe3eb152d, 0x54f6d429, 0x7926a9c5, 0xce3b68c1, 0x171d2bcc, 0xa000eac8, 0xa550add6, 0x124d6cd2, 0xcb6b2fdf, + 0x7c76eedb, 0xc1cba1e3, 0x76d660e7, 0xaff023ea, 0x18ede2ee, 0x1dbda5f0, 0xaaa064f4, 0x738627f9, 0xc49be6fd, 0x09fdb889, 0xbee0798d, + 0x67c63a80, 0xd0dbfb84, 0xd58bbc9a, 0x62967d9e, 0xbbb03e93, 0x0cadff97, 0xb110b0af, 0x060d71ab, 0xdf2b32a6, 0x6836f3a2, 0x6d66b4bc, + 0xda7b75b8, 0x035d36b5, 0xb440f7b1}, + {0x00000000, 0xdcc119d2, 0x0f9ef2a0, 0xd35feb72, 0xa9212445, 0x75e03d97, 0xa6bfd6e5, 0x7a7ecf37, 0x5243488a, 0x8e825158, 0x5dddba2a, + 0x811ca3f8, 0xfb626ccf, 0x27a3751d, 0xf4fc9e6f, 0x283d87bd, 0x139b5110, 0xcf5a48c2, 0x1c05a3b0, 0xc0c4ba62, 0xbaba7555, 0x667b6c87, + 0xb52487f5, 0x69e59e27, 0x41d8199a, 0x9d190048, 0x4e46eb3a, 0x9287f2e8, 0xe8f93ddf, 0x3438240d, 0xe767cf7f, 0x3ba6d6ad, 0x2636a320, + 0xfaf7baf2, 0x29a85180, 0xf5694852, 0x8f178765, 0x53d69eb7, 0x808975c5, 0x5c486c17, 0x7475ebaa, 0xa8b4f278, 0x7beb190a, 0xa72a00d8, + 0xdd54cfef, 0x0195d63d, 0xd2ca3d4f, 0x0e0b249d, 0x35adf230, 0xe96cebe2, 0x3a330090, 0xe6f21942, 0x9c8cd675, 0x404dcfa7, 0x931224d5, + 0x4fd33d07, 0x67eebaba, 0xbb2fa368, 0x6870481a, 0xb4b151c8, 0xcecf9eff, 0x120e872d, 0xc1516c5f, 0x1d90758d, 0x4c6c4641, 0x90ad5f93, + 0x43f2b4e1, 0x9f33ad33, 0xe54d6204, 0x398c7bd6, 0xead390a4, 0x36128976, 0x1e2f0ecb, 0xc2ee1719, 0x11b1fc6b, 0xcd70e5b9, 0xb70e2a8e, + 0x6bcf335c, 0xb890d82e, 0x6451c1fc, 0x5ff71751, 0x83360e83, 0x5069e5f1, 0x8ca8fc23, 0xf6d63314, 0x2a172ac6, 0xf948c1b4, 0x2589d866, + 0x0db45fdb, 0xd1754609, 0x022aad7b, 0xdeebb4a9, 0xa4957b9e, 0x7854624c, 0xab0b893e, 0x77ca90ec, 0x6a5ae561, 0xb69bfcb3, 0x65c417c1, + 0xb9050e13, 0xc37bc124, 
0x1fbad8f6, 0xcce53384, 0x10242a56, 0x3819adeb, 0xe4d8b439, 0x37875f4b, 0xeb464699, 0x913889ae, 0x4df9907c, + 0x9ea67b0e, 0x426762dc, 0x79c1b471, 0xa500ada3, 0x765f46d1, 0xaa9e5f03, 0xd0e09034, 0x0c2189e6, 0xdf7e6294, 0x03bf7b46, 0x2b82fcfb, + 0xf743e529, 0x241c0e5b, 0xf8dd1789, 0x82a3d8be, 0x5e62c16c, 0x8d3d2a1e, 0x51fc33cc, 0x98d88c82, 0x44199550, 0x97467e22, 0x4b8767f0, + 0x31f9a8c7, 0xed38b115, 0x3e675a67, 0xe2a643b5, 0xca9bc408, 0x165addda, 0xc50536a8, 0x19c42f7a, 0x63bae04d, 0xbf7bf99f, 0x6c2412ed, + 0xb0e50b3f, 0x8b43dd92, 0x5782c440, 0x84dd2f32, 0x581c36e0, 0x2262f9d7, 0xfea3e005, 0x2dfc0b77, 0xf13d12a5, 0xd9009518, 0x05c18cca, + 0xd69e67b8, 0x0a5f7e6a, 0x7021b15d, 0xace0a88f, 0x7fbf43fd, 0xa37e5a2f, 0xbeee2fa2, 0x622f3670, 0xb170dd02, 0x6db1c4d0, 0x17cf0be7, + 0xcb0e1235, 0x1851f947, 0xc490e095, 0xecad6728, 0x306c7efa, 0xe3339588, 0x3ff28c5a, 0x458c436d, 0x994d5abf, 0x4a12b1cd, 0x96d3a81f, + 0xad757eb2, 0x71b46760, 0xa2eb8c12, 0x7e2a95c0, 0x04545af7, 0xd8954325, 0x0bcaa857, 0xd70bb185, 0xff363638, 0x23f72fea, 0xf0a8c498, + 0x2c69dd4a, 0x5617127d, 0x8ad60baf, 0x5989e0dd, 0x8548f90f, 0xd4b4cac3, 0x0875d311, 0xdb2a3863, 0x07eb21b1, 0x7d95ee86, 0xa154f754, + 0x720b1c26, 0xaeca05f4, 0x86f78249, 0x5a369b9b, 0x896970e9, 0x55a8693b, 0x2fd6a60c, 0xf317bfde, 0x204854ac, 0xfc894d7e, 0xc72f9bd3, + 0x1bee8201, 0xc8b16973, 0x147070a1, 0x6e0ebf96, 0xb2cfa644, 0x61904d36, 0xbd5154e4, 0x956cd359, 0x49adca8b, 0x9af221f9, 0x4633382b, + 0x3c4df71c, 0xe08ceece, 0x33d305bc, 0xef121c6e, 0xf28269e3, 0x2e437031, 0xfd1c9b43, 0x21dd8291, 0x5ba34da6, 0x87625474, 0x543dbf06, + 0x88fca6d4, 0xa0c12169, 0x7c0038bb, 0xaf5fd3c9, 0x739eca1b, 0x09e0052c, 0xd5211cfe, 0x067ef78c, 0xdabfee5e, 0xe11938f3, 0x3dd82121, + 0xee87ca53, 0x3246d381, 0x48381cb6, 0x94f90564, 0x47a6ee16, 0x9b67f7c4, 0xb35a7079, 0x6f9b69ab, 0xbcc482d9, 0x60059b0b, 0x1a7b543c, + 0xc6ba4dee, 0x15e5a69c, 0xc924bf4e}, + {0x00000000, 0x87acd801, 0x0e59b103, 0x89f56902, 0x1cb26207, 0x9b1eba06, 0x12ebd304, 0x95470b05, 0x3864c50e, 
0xbfc81d0f, 0x363d740d, + 0xb191ac0c, 0x24d6a709, 0xa37a7f08, 0x2a8f160a, 0xad23ce0b, 0x70c88a1d, 0xf764521c, 0x7e913b1e, 0xf93de31f, 0x6c7ae81a, 0xebd6301b, + 0x62235919, 0xe58f8118, 0x48ac4f13, 0xcf009712, 0x46f5fe10, 0xc1592611, 0x541e2d14, 0xd3b2f515, 0x5a479c17, 0xddeb4416, 0xe090153b, + 0x673ccd3a, 0xeec9a438, 0x69657c39, 0xfc22773c, 0x7b8eaf3d, 0xf27bc63f, 0x75d71e3e, 0xd8f4d035, 0x5f580834, 0xd6ad6136, 0x5101b937, + 0xc446b232, 0x43ea6a33, 0xca1f0331, 0x4db3db30, 0x90589f26, 0x17f44727, 0x9e012e25, 0x19adf624, 0x8ceafd21, 0x0b462520, 0x82b34c22, + 0x051f9423, 0xa83c5a28, 0x2f908229, 0xa665eb2b, 0x21c9332a, 0xb48e382f, 0x3322e02e, 0xbad7892c, 0x3d7b512d, 0xc0212b76, 0x478df377, + 0xce789a75, 0x49d44274, 0xdc934971, 0x5b3f9170, 0xd2caf872, 0x55662073, 0xf845ee78, 0x7fe93679, 0xf61c5f7b, 0x71b0877a, 0xe4f78c7f, + 0x635b547e, 0xeaae3d7c, 0x6d02e57d, 0xb0e9a16b, 0x3745796a, 0xbeb01068, 0x391cc869, 0xac5bc36c, 0x2bf71b6d, 0xa202726f, 0x25aeaa6e, + 0x888d6465, 0x0f21bc64, 0x86d4d566, 0x01780d67, 0x943f0662, 0x1393de63, 0x9a66b761, 0x1dca6f60, 0x20b13e4d, 0xa71de64c, 0x2ee88f4e, + 0xa944574f, 0x3c035c4a, 0xbbaf844b, 0x325aed49, 0xb5f63548, 0x18d5fb43, 0x9f792342, 0x168c4a40, 0x91209241, 0x04679944, 0x83cb4145, + 0x0a3e2847, 0x8d92f046, 0x5079b450, 0xd7d56c51, 0x5e200553, 0xd98cdd52, 0x4ccbd657, 0xcb670e56, 0x42926754, 0xc53ebf55, 0x681d715e, + 0xefb1a95f, 0x6644c05d, 0xe1e8185c, 0x74af1359, 0xf303cb58, 0x7af6a25a, 0xfd5a7a5b, 0x804356ec, 0x07ef8eed, 0x8e1ae7ef, 0x09b63fee, + 0x9cf134eb, 0x1b5decea, 0x92a885e8, 0x15045de9, 0xb82793e2, 0x3f8b4be3, 0xb67e22e1, 0x31d2fae0, 0xa495f1e5, 0x233929e4, 0xaacc40e6, + 0x2d6098e7, 0xf08bdcf1, 0x772704f0, 0xfed26df2, 0x797eb5f3, 0xec39bef6, 0x6b9566f7, 0xe2600ff5, 0x65ccd7f4, 0xc8ef19ff, 0x4f43c1fe, + 0xc6b6a8fc, 0x411a70fd, 0xd45d7bf8, 0x53f1a3f9, 0xda04cafb, 0x5da812fa, 0x60d343d7, 0xe77f9bd6, 0x6e8af2d4, 0xe9262ad5, 0x7c6121d0, + 0xfbcdf9d1, 0x723890d3, 0xf59448d2, 0x58b786d9, 0xdf1b5ed8, 0x56ee37da, 0xd142efdb, 0x4405e4de, 
0xc3a93cdf, 0x4a5c55dd, 0xcdf08ddc, + 0x101bc9ca, 0x97b711cb, 0x1e4278c9, 0x99eea0c8, 0x0ca9abcd, 0x8b0573cc, 0x02f01ace, 0x855cc2cf, 0x287f0cc4, 0xafd3d4c5, 0x2626bdc7, + 0xa18a65c6, 0x34cd6ec3, 0xb361b6c2, 0x3a94dfc0, 0xbd3807c1, 0x40627d9a, 0xc7cea59b, 0x4e3bcc99, 0xc9971498, 0x5cd01f9d, 0xdb7cc79c, + 0x5289ae9e, 0xd525769f, 0x7806b894, 0xffaa6095, 0x765f0997, 0xf1f3d196, 0x64b4da93, 0xe3180292, 0x6aed6b90, 0xed41b391, 0x30aaf787, + 0xb7062f86, 0x3ef34684, 0xb95f9e85, 0x2c189580, 0xabb44d81, 0x22412483, 0xa5edfc82, 0x08ce3289, 0x8f62ea88, 0x0697838a, 0x813b5b8b, + 0x147c508e, 0x93d0888f, 0x1a25e18d, 0x9d89398c, 0xa0f268a1, 0x275eb0a0, 0xaeabd9a2, 0x290701a3, 0xbc400aa6, 0x3becd2a7, 0xb219bba5, + 0x35b563a4, 0x9896adaf, 0x1f3a75ae, 0x96cf1cac, 0x1163c4ad, 0x8424cfa8, 0x038817a9, 0x8a7d7eab, 0x0dd1a6aa, 0xd03ae2bc, 0x57963abd, + 0xde6353bf, 0x59cf8bbe, 0xcc8880bb, 0x4b2458ba, 0xc2d131b8, 0x457de9b9, 0xe85e27b2, 0x6ff2ffb3, 0xe60796b1, 0x61ab4eb0, 0xf4ec45b5, + 0x73409db4, 0xfab5f4b6, 0x7d192cb7}, + {0x00000000, 0xb79a6ddc, 0xd9281abc, 0x6eb27760, 0x054cf57c, 0xb2d698a0, 0xdc64efc0, 0x6bfe821c, 0x0a98eaf9, 0xbd028725, 0xd3b0f045, + 0x642a9d99, 0x0fd41f85, 0xb84e7259, 0xd6fc0539, 0x616668e5, 0xa32d14f7, 0x14b7792b, 0x7a050e4b, 0xcd9f6397, 0xa661e18b, 0x11fb8c57, + 0x7f49fb37, 0xc8d396eb, 0xa9b5fe0e, 0x1e2f93d2, 0x709de4b2, 0xc707896e, 0xacf90b72, 0x1b6366ae, 0x75d111ce, 0xc24b7c12, 0xf146e9ea, + 0x46dc8436, 0x286ef356, 0x9ff49e8a, 0xf40a1c96, 0x4390714a, 0x2d22062a, 0x9ab86bf6, 0xfbde0313, 0x4c446ecf, 0x22f619af, 0x956c7473, + 0xfe92f66f, 0x49089bb3, 0x27baecd3, 0x9020810f, 0x526bfd1d, 0xe5f190c1, 0x8b43e7a1, 0x3cd98a7d, 0x57270861, 0xe0bd65bd, 0x8e0f12dd, + 0x39957f01, 0x58f317e4, 0xef697a38, 0x81db0d58, 0x36416084, 0x5dbfe298, 0xea258f44, 0x8497f824, 0x330d95f8, 0x559013d1, 0xe20a7e0d, + 0x8cb8096d, 0x3b2264b1, 0x50dce6ad, 0xe7468b71, 0x89f4fc11, 0x3e6e91cd, 0x5f08f928, 0xe89294f4, 0x8620e394, 0x31ba8e48, 0x5a440c54, + 0xedde6188, 0x836c16e8, 0x34f67b34, 
0xf6bd0726, 0x41276afa, 0x2f951d9a, 0x980f7046, 0xf3f1f25a, 0x446b9f86, 0x2ad9e8e6, 0x9d43853a, + 0xfc25eddf, 0x4bbf8003, 0x250df763, 0x92979abf, 0xf96918a3, 0x4ef3757f, 0x2041021f, 0x97db6fc3, 0xa4d6fa3b, 0x134c97e7, 0x7dfee087, + 0xca648d5b, 0xa19a0f47, 0x1600629b, 0x78b215fb, 0xcf287827, 0xae4e10c2, 0x19d47d1e, 0x77660a7e, 0xc0fc67a2, 0xab02e5be, 0x1c988862, + 0x722aff02, 0xc5b092de, 0x07fbeecc, 0xb0618310, 0xded3f470, 0x694999ac, 0x02b71bb0, 0xb52d766c, 0xdb9f010c, 0x6c056cd0, 0x0d630435, + 0xbaf969e9, 0xd44b1e89, 0x63d17355, 0x082ff149, 0xbfb59c95, 0xd107ebf5, 0x669d8629, 0x1d3de6a6, 0xaaa78b7a, 0xc415fc1a, 0x738f91c6, + 0x187113da, 0xafeb7e06, 0xc1590966, 0x76c364ba, 0x17a50c5f, 0xa03f6183, 0xce8d16e3, 0x79177b3f, 0x12e9f923, 0xa57394ff, 0xcbc1e39f, + 0x7c5b8e43, 0xbe10f251, 0x098a9f8d, 0x6738e8ed, 0xd0a28531, 0xbb5c072d, 0x0cc66af1, 0x62741d91, 0xd5ee704d, 0xb48818a8, 0x03127574, + 0x6da00214, 0xda3a6fc8, 0xb1c4edd4, 0x065e8008, 0x68ecf768, 0xdf769ab4, 0xec7b0f4c, 0x5be16290, 0x355315f0, 0x82c9782c, 0xe937fa30, + 0x5ead97ec, 0x301fe08c, 0x87858d50, 0xe6e3e5b5, 0x51798869, 0x3fcbff09, 0x885192d5, 0xe3af10c9, 0x54357d15, 0x3a870a75, 0x8d1d67a9, + 0x4f561bbb, 0xf8cc7667, 0x967e0107, 0x21e46cdb, 0x4a1aeec7, 0xfd80831b, 0x9332f47b, 0x24a899a7, 0x45cef142, 0xf2549c9e, 0x9ce6ebfe, + 0x2b7c8622, 0x4082043e, 0xf71869e2, 0x99aa1e82, 0x2e30735e, 0x48adf577, 0xff3798ab, 0x9185efcb, 0x261f8217, 0x4de1000b, 0xfa7b6dd7, + 0x94c91ab7, 0x2353776b, 0x42351f8e, 0xf5af7252, 0x9b1d0532, 0x2c8768ee, 0x4779eaf2, 0xf0e3872e, 0x9e51f04e, 0x29cb9d92, 0xeb80e180, + 0x5c1a8c5c, 0x32a8fb3c, 0x853296e0, 0xeecc14fc, 0x59567920, 0x37e40e40, 0x807e639c, 0xe1180b79, 0x568266a5, 0x383011c5, 0x8faa7c19, + 0xe454fe05, 0x53ce93d9, 0x3d7ce4b9, 0x8ae68965, 0xb9eb1c9d, 0x0e717141, 0x60c30621, 0xd7596bfd, 0xbca7e9e1, 0x0b3d843d, 0x658ff35d, + 0xd2159e81, 0xb373f664, 0x04e99bb8, 0x6a5becd8, 0xddc18104, 0xb63f0318, 0x01a56ec4, 0x6f1719a4, 0xd88d7478, 0x1ac6086a, 0xad5c65b6, + 0xc3ee12d6, 0x74747f0a, 
0x1f8afd16, 0xa81090ca, 0xc6a2e7aa, 0x71388a76, 0x105ee293, 0xa7c48f4f, 0xc976f82f, 0x7eec95f3, 0x151217ef, + 0xa2887a33, 0xcc3a0d53, 0x7ba0608f}, + {0x00000000, 0x8d670d49, 0x1acf1a92, 0x97a817db, 0x8383f420, 0x0ee4f969, 0x994ceeb2, 0x142be3fb, 0x0607e941, 0x8b60e408, 0x1cc8f3d3, + 0x91affe9a, 0x85841d61, 0x08e31028, 0x9f4b07f3, 0x122c0aba, 0x0c0ed283, 0x8169dfca, 0x16c1c811, 0x9ba6c558, 0x8f8d26a3, 0x02ea2bea, + 0x95423c31, 0x18253178, 0x0a093bc2, 0x876e368b, 0x10c62150, 0x9da12c19, 0x898acfe2, 0x04edc2ab, 0x9345d570, 0x1e22d839, 0xaf016503, + 0x2266684a, 0xb5ce7f91, 0x38a972d8, 0x2c829123, 0xa1e59c6a, 0x364d8bb1, 0xbb2a86f8, 0xa9068c42, 0x2461810b, 0xb3c996d0, 0x3eae9b99, + 0x2a857862, 0xa7e2752b, 0x304a62f0, 0xbd2d6fb9, 0xa30fb780, 0x2e68bac9, 0xb9c0ad12, 0x34a7a05b, 0x208c43a0, 0xadeb4ee9, 0x3a435932, + 0xb724547b, 0xa5085ec1, 0x286f5388, 0xbfc74453, 0x32a0491a, 0x268baae1, 0xabeca7a8, 0x3c44b073, 0xb123bd3a, 0x5e03ca06, 0xd364c74f, + 0x44ccd094, 0xc9abdddd, 0xdd803e26, 0x50e7336f, 0xc74f24b4, 0x4a2829fd, 0x58042347, 0xd5632e0e, 0x42cb39d5, 0xcfac349c, 0xdb87d767, + 0x56e0da2e, 0xc148cdf5, 0x4c2fc0bc, 0x520d1885, 0xdf6a15cc, 0x48c20217, 0xc5a50f5e, 0xd18eeca5, 0x5ce9e1ec, 0xcb41f637, 0x4626fb7e, + 0x540af1c4, 0xd96dfc8d, 0x4ec5eb56, 0xc3a2e61f, 0xd78905e4, 0x5aee08ad, 0xcd461f76, 0x4021123f, 0xf102af05, 0x7c65a24c, 0xebcdb597, + 0x66aab8de, 0x72815b25, 0xffe6566c, 0x684e41b7, 0xe5294cfe, 0xf7054644, 0x7a624b0d, 0xedca5cd6, 0x60ad519f, 0x7486b264, 0xf9e1bf2d, + 0x6e49a8f6, 0xe32ea5bf, 0xfd0c7d86, 0x706b70cf, 0xe7c36714, 0x6aa46a5d, 0x7e8f89a6, 0xf3e884ef, 0x64409334, 0xe9279e7d, 0xfb0b94c7, + 0x766c998e, 0xe1c48e55, 0x6ca3831c, 0x788860e7, 0xf5ef6dae, 0x62477a75, 0xef20773c, 0xbc06940d, 0x31619944, 0xa6c98e9f, 0x2bae83d6, + 0x3f85602d, 0xb2e26d64, 0x254a7abf, 0xa82d77f6, 0xba017d4c, 0x37667005, 0xa0ce67de, 0x2da96a97, 0x3982896c, 0xb4e58425, 0x234d93fe, + 0xae2a9eb7, 0xb008468e, 0x3d6f4bc7, 0xaac75c1c, 0x27a05155, 0x338bb2ae, 0xbeecbfe7, 0x2944a83c, 0xa423a575, 
0xb60fafcf, 0x3b68a286, + 0xacc0b55d, 0x21a7b814, 0x358c5bef, 0xb8eb56a6, 0x2f43417d, 0xa2244c34, 0x1307f10e, 0x9e60fc47, 0x09c8eb9c, 0x84afe6d5, 0x9084052e, + 0x1de30867, 0x8a4b1fbc, 0x072c12f5, 0x1500184f, 0x98671506, 0x0fcf02dd, 0x82a80f94, 0x9683ec6f, 0x1be4e126, 0x8c4cf6fd, 0x012bfbb4, + 0x1f09238d, 0x926e2ec4, 0x05c6391f, 0x88a13456, 0x9c8ad7ad, 0x11eddae4, 0x8645cd3f, 0x0b22c076, 0x190ecacc, 0x9469c785, 0x03c1d05e, + 0x8ea6dd17, 0x9a8d3eec, 0x17ea33a5, 0x8042247e, 0x0d252937, 0xe2055e0b, 0x6f625342, 0xf8ca4499, 0x75ad49d0, 0x6186aa2b, 0xece1a762, + 0x7b49b0b9, 0xf62ebdf0, 0xe402b74a, 0x6965ba03, 0xfecdadd8, 0x73aaa091, 0x6781436a, 0xeae64e23, 0x7d4e59f8, 0xf02954b1, 0xee0b8c88, + 0x636c81c1, 0xf4c4961a, 0x79a39b53, 0x6d8878a8, 0xe0ef75e1, 0x7747623a, 0xfa206f73, 0xe80c65c9, 0x656b6880, 0xf2c37f5b, 0x7fa47212, + 0x6b8f91e9, 0xe6e89ca0, 0x71408b7b, 0xfc278632, 0x4d043b08, 0xc0633641, 0x57cb219a, 0xdaac2cd3, 0xce87cf28, 0x43e0c261, 0xd448d5ba, + 0x592fd8f3, 0x4b03d249, 0xc664df00, 0x51ccc8db, 0xdcabc592, 0xc8802669, 0x45e72b20, 0xd24f3cfb, 0x5f2831b2, 0x410ae98b, 0xcc6de4c2, + 0x5bc5f319, 0xd6a2fe50, 0xc2891dab, 0x4fee10e2, 0xd8460739, 0x55210a70, 0x470d00ca, 0xca6a0d83, 0x5dc21a58, 0xd0a51711, 0xc48ef4ea, + 0x49e9f9a3, 0xde41ee78, 0x5326e331}, + {0x00000000, 0x780d281b, 0xf01a5036, 0x8817782d, 0xe035a06c, 0x98388877, 0x102ff05a, 0x6822d841, 0xc06b40d9, 0xb86668c2, 0x307110ef, + 0x487c38f4, 0x205ee0b5, 0x5853c8ae, 0xd044b083, 0xa8499898, 0x37ca41b6, 0x4fc769ad, 0xc7d01180, 0xbfdd399b, 0xd7ffe1da, 0xaff2c9c1, + 0x27e5b1ec, 0x5fe899f7, 0xf7a1016f, 0x8fac2974, 0x07bb5159, 0x7fb67942, 0x1794a103, 0x6f998918, 0xe78ef135, 0x9f83d92e, 0xd9894268, + 0xa1846a73, 0x2993125e, 0x519e3a45, 0x39bce204, 0x41b1ca1f, 0xc9a6b232, 0xb1ab9a29, 0x19e202b1, 0x61ef2aaa, 0xe9f85287, 0x91f57a9c, + 0xf9d7a2dd, 0x81da8ac6, 0x09cdf2eb, 0x71c0daf0, 0xee4303de, 0x964e2bc5, 0x1e5953e8, 0x66547bf3, 0x0e76a3b2, 0x767b8ba9, 0xfe6cf384, + 0x8661db9f, 0x2e284307, 0x56256b1c, 0xde321331, 
0xa63f3b2a, 0xce1de36b, 0xb610cb70, 0x3e07b35d, 0x460a9b46, 0xb21385d0, 0xca1eadcb, + 0x4209d5e6, 0x3a04fdfd, 0x522625bc, 0x2a2b0da7, 0xa23c758a, 0xda315d91, 0x7278c509, 0x0a75ed12, 0x8262953f, 0xfa6fbd24, 0x924d6565, + 0xea404d7e, 0x62573553, 0x1a5a1d48, 0x85d9c466, 0xfdd4ec7d, 0x75c39450, 0x0dcebc4b, 0x65ec640a, 0x1de14c11, 0x95f6343c, 0xedfb1c27, + 0x45b284bf, 0x3dbfaca4, 0xb5a8d489, 0xcda5fc92, 0xa58724d3, 0xdd8a0cc8, 0x559d74e5, 0x2d905cfe, 0x6b9ac7b8, 0x1397efa3, 0x9b80978e, + 0xe38dbf95, 0x8baf67d4, 0xf3a24fcf, 0x7bb537e2, 0x03b81ff9, 0xabf18761, 0xd3fcaf7a, 0x5bebd757, 0x23e6ff4c, 0x4bc4270d, 0x33c90f16, + 0xbbde773b, 0xc3d35f20, 0x5c50860e, 0x245dae15, 0xac4ad638, 0xd447fe23, 0xbc652662, 0xc4680e79, 0x4c7f7654, 0x34725e4f, 0x9c3bc6d7, + 0xe436eecc, 0x6c2196e1, 0x142cbefa, 0x7c0e66bb, 0x04034ea0, 0x8c14368d, 0xf4191e96, 0xd33acba5, 0xab37e3be, 0x23209b93, 0x5b2db388, + 0x330f6bc9, 0x4b0243d2, 0xc3153bff, 0xbb1813e4, 0x13518b7c, 0x6b5ca367, 0xe34bdb4a, 0x9b46f351, 0xf3642b10, 0x8b69030b, 0x037e7b26, + 0x7b73533d, 0xe4f08a13, 0x9cfda208, 0x14eada25, 0x6ce7f23e, 0x04c52a7f, 0x7cc80264, 0xf4df7a49, 0x8cd25252, 0x249bcaca, 0x5c96e2d1, + 0xd4819afc, 0xac8cb2e7, 0xc4ae6aa6, 0xbca342bd, 0x34b43a90, 0x4cb9128b, 0x0ab389cd, 0x72bea1d6, 0xfaa9d9fb, 0x82a4f1e0, 0xea8629a1, + 0x928b01ba, 0x1a9c7997, 0x6291518c, 0xcad8c914, 0xb2d5e10f, 0x3ac29922, 0x42cfb139, 0x2aed6978, 0x52e04163, 0xdaf7394e, 0xa2fa1155, + 0x3d79c87b, 0x4574e060, 0xcd63984d, 0xb56eb056, 0xdd4c6817, 0xa541400c, 0x2d563821, 0x555b103a, 0xfd1288a2, 0x851fa0b9, 0x0d08d894, + 0x7505f08f, 0x1d2728ce, 0x652a00d5, 0xed3d78f8, 0x953050e3, 0x61294e75, 0x1924666e, 0x91331e43, 0xe93e3658, 0x811cee19, 0xf911c602, + 0x7106be2f, 0x090b9634, 0xa1420eac, 0xd94f26b7, 0x51585e9a, 0x29557681, 0x4177aec0, 0x397a86db, 0xb16dfef6, 0xc960d6ed, 0x56e30fc3, + 0x2eee27d8, 0xa6f95ff5, 0xdef477ee, 0xb6d6afaf, 0xcedb87b4, 0x46ccff99, 0x3ec1d782, 0x96884f1a, 0xee856701, 0x66921f2c, 0x1e9f3737, + 0x76bdef76, 0x0eb0c76d, 0x86a7bf40, 
0xfeaa975b, 0xb8a00c1d, 0xc0ad2406, 0x48ba5c2b, 0x30b77430, 0x5895ac71, 0x2098846a, 0xa88ffc47, + 0xd082d45c, 0x78cb4cc4, 0x00c664df, 0x88d11cf2, 0xf0dc34e9, 0x98feeca8, 0xe0f3c4b3, 0x68e4bc9e, 0x10e99485, 0x8f6a4dab, 0xf76765b0, + 0x7f701d9d, 0x077d3586, 0x6f5fedc7, 0x1752c5dc, 0x9f45bdf1, 0xe74895ea, 0x4f010d72, 0x370c2569, 0xbf1b5d44, 0xc716755f, 0xaf34ad1e, + 0xd7398505, 0x5f2efd28, 0x2723d533}, + {0x00000000, 0x1168574f, 0x22d0ae9e, 0x33b8f9d1, 0xf3bd9c39, 0xe2d5cb76, 0xd16d32a7, 0xc00565e8, 0xe67b3973, 0xf7136e3c, 0xc4ab97ed, + 0xd5c3c0a2, 0x15c6a54a, 0x04aef205, 0x37160bd4, 0x267e5c9b, 0xccf772e6, 0xdd9f25a9, 0xee27dc78, 0xff4f8b37, 0x3f4aeedf, 0x2e22b990, + 0x1d9a4041, 0x0cf2170e, 0x2a8c4b95, 0x3be41cda, 0x085ce50b, 0x1934b244, 0xd931d7ac, 0xc85980e3, 0xfbe17932, 0xea892e7d, 0x2ff224c8, + 0x3e9a7387, 0x0d228a56, 0x1c4add19, 0xdc4fb8f1, 0xcd27efbe, 0xfe9f166f, 0xeff74120, 0xc9891dbb, 0xd8e14af4, 0xeb59b325, 0xfa31e46a, + 0x3a348182, 0x2b5cd6cd, 0x18e42f1c, 0x098c7853, 0xe305562e, 0xf26d0161, 0xc1d5f8b0, 0xd0bdafff, 0x10b8ca17, 0x01d09d58, 0x32686489, + 0x230033c6, 0x057e6f5d, 0x14163812, 0x27aec1c3, 0x36c6968c, 0xf6c3f364, 0xe7aba42b, 0xd4135dfa, 0xc57b0ab5, 0xe9f98894, 0xf891dfdb, + 0xcb29260a, 0xda417145, 0x1a4414ad, 0x0b2c43e2, 0x3894ba33, 0x29fced7c, 0x0f82b1e7, 0x1eeae6a8, 0x2d521f79, 0x3c3a4836, 0xfc3f2dde, + 0xed577a91, 0xdeef8340, 0xcf87d40f, 0x250efa72, 0x3466ad3d, 0x07de54ec, 0x16b603a3, 0xd6b3664b, 0xc7db3104, 0xf463c8d5, 0xe50b9f9a, + 0xc375c301, 0xd21d944e, 0xe1a56d9f, 0xf0cd3ad0, 0x30c85f38, 0x21a00877, 0x1218f1a6, 0x0370a6e9, 0xc60bac5c, 0xd763fb13, 0xe4db02c2, + 0xf5b3558d, 0x35b63065, 0x24de672a, 0x17669efb, 0x060ec9b4, 0x2070952f, 0x3118c260, 0x02a03bb1, 0x13c86cfe, 0xd3cd0916, 0xc2a55e59, + 0xf11da788, 0xe075f0c7, 0x0afcdeba, 0x1b9489f5, 0x282c7024, 0x3944276b, 0xf9414283, 0xe82915cc, 0xdb91ec1d, 0xcaf9bb52, 0xec87e7c9, + 0xfdefb086, 0xce574957, 0xdf3f1e18, 0x1f3a7bf0, 0x0e522cbf, 0x3dead56e, 0x2c828221, 0x65eed02d, 0x74868762, 0x473e7eb3, 
0x565629fc, + 0x96534c14, 0x873b1b5b, 0xb483e28a, 0xa5ebb5c5, 0x8395e95e, 0x92fdbe11, 0xa14547c0, 0xb02d108f, 0x70287567, 0x61402228, 0x52f8dbf9, + 0x43908cb6, 0xa919a2cb, 0xb871f584, 0x8bc90c55, 0x9aa15b1a, 0x5aa43ef2, 0x4bcc69bd, 0x7874906c, 0x691cc723, 0x4f629bb8, 0x5e0accf7, + 0x6db23526, 0x7cda6269, 0xbcdf0781, 0xadb750ce, 0x9e0fa91f, 0x8f67fe50, 0x4a1cf4e5, 0x5b74a3aa, 0x68cc5a7b, 0x79a40d34, 0xb9a168dc, + 0xa8c93f93, 0x9b71c642, 0x8a19910d, 0xac67cd96, 0xbd0f9ad9, 0x8eb76308, 0x9fdf3447, 0x5fda51af, 0x4eb206e0, 0x7d0aff31, 0x6c62a87e, + 0x86eb8603, 0x9783d14c, 0xa43b289d, 0xb5537fd2, 0x75561a3a, 0x643e4d75, 0x5786b4a4, 0x46eee3eb, 0x6090bf70, 0x71f8e83f, 0x424011ee, + 0x532846a1, 0x932d2349, 0x82457406, 0xb1fd8dd7, 0xa095da98, 0x8c1758b9, 0x9d7f0ff6, 0xaec7f627, 0xbfafa168, 0x7faac480, 0x6ec293cf, + 0x5d7a6a1e, 0x4c123d51, 0x6a6c61ca, 0x7b043685, 0x48bccf54, 0x59d4981b, 0x99d1fdf3, 0x88b9aabc, 0xbb01536d, 0xaa690422, 0x40e02a5f, + 0x51887d10, 0x623084c1, 0x7358d38e, 0xb35db666, 0xa235e129, 0x918d18f8, 0x80e54fb7, 0xa69b132c, 0xb7f34463, 0x844bbdb2, 0x9523eafd, + 0x55268f15, 0x444ed85a, 0x77f6218b, 0x669e76c4, 0xa3e57c71, 0xb28d2b3e, 0x8135d2ef, 0x905d85a0, 0x5058e048, 0x4130b707, 0x72884ed6, + 0x63e01999, 0x459e4502, 0x54f6124d, 0x674eeb9c, 0x7626bcd3, 0xb623d93b, 0xa74b8e74, 0x94f377a5, 0x859b20ea, 0x6f120e97, 0x7e7a59d8, + 0x4dc2a009, 0x5caaf746, 0x9caf92ae, 0x8dc7c5e1, 0xbe7f3c30, 0xaf176b7f, 0x896937e4, 0x980160ab, 0xabb9997a, 0xbad1ce35, 0x7ad4abdd, + 0x6bbcfc92, 0x58040543, 0x496c520c}, + {0x00000000, 0xcadca15b, 0x94b943b7, 0x5e65e2ec, 0x9f6e466a, 0x55b2e731, 0x0bd705dd, 0xc10ba486, 0x3edd8cd4, 0xf4012d8f, 0xaa64cf63, + 0x60b86e38, 0xa1b3cabe, 0x6b6f6be5, 0x350a8909, 0xffd62852, 0xcba7d8ad, 0x017b79f6, 0x5f1e9b1a, 0x95c23a41, 0x54c99ec7, 0x9e153f9c, + 0xc070dd70, 0x0aac7c2b, 0xf57a5479, 0x3fa6f522, 0x61c317ce, 0xab1fb695, 0x6a141213, 0xa0c8b348, 0xfead51a4, 0x3471f0ff, 0x2152705f, + 0xeb8ed104, 0xb5eb33e8, 0x7f3792b3, 0xbe3c3635, 0x74e0976e, 
0x2a857582, 0xe059d4d9, 0x1f8ffc8b, 0xd5535dd0, 0x8b36bf3c, 0x41ea1e67, + 0x80e1bae1, 0x4a3d1bba, 0x1458f956, 0xde84580d, 0xeaf5a8f2, 0x202909a9, 0x7e4ceb45, 0xb4904a1e, 0x759bee98, 0xbf474fc3, 0xe122ad2f, + 0x2bfe0c74, 0xd4282426, 0x1ef4857d, 0x40916791, 0x8a4dc6ca, 0x4b46624c, 0x819ac317, 0xdfff21fb, 0x152380a0, 0x42a4e0be, 0x887841e5, + 0xd61da309, 0x1cc10252, 0xddcaa6d4, 0x1716078f, 0x4973e563, 0x83af4438, 0x7c796c6a, 0xb6a5cd31, 0xe8c02fdd, 0x221c8e86, 0xe3172a00, + 0x29cb8b5b, 0x77ae69b7, 0xbd72c8ec, 0x89033813, 0x43df9948, 0x1dba7ba4, 0xd766daff, 0x166d7e79, 0xdcb1df22, 0x82d43dce, 0x48089c95, + 0xb7deb4c7, 0x7d02159c, 0x2367f770, 0xe9bb562b, 0x28b0f2ad, 0xe26c53f6, 0xbc09b11a, 0x76d51041, 0x63f690e1, 0xa92a31ba, 0xf74fd356, + 0x3d93720d, 0xfc98d68b, 0x364477d0, 0x6821953c, 0xa2fd3467, 0x5d2b1c35, 0x97f7bd6e, 0xc9925f82, 0x034efed9, 0xc2455a5f, 0x0899fb04, + 0x56fc19e8, 0x9c20b8b3, 0xa851484c, 0x628de917, 0x3ce80bfb, 0xf634aaa0, 0x373f0e26, 0xfde3af7d, 0xa3864d91, 0x695aecca, 0x968cc498, + 0x5c5065c3, 0x0235872f, 0xc8e92674, 0x09e282f2, 0xc33e23a9, 0x9d5bc145, 0x5787601e, 0x33550079, 0xf989a122, 0xa7ec43ce, 0x6d30e295, + 0xac3b4613, 0x66e7e748, 0x388205a4, 0xf25ea4ff, 0x0d888cad, 0xc7542df6, 0x9931cf1a, 0x53ed6e41, 0x92e6cac7, 0x583a6b9c, 0x065f8970, + 0xcc83282b, 0xf8f2d8d4, 0x322e798f, 0x6c4b9b63, 0xa6973a38, 0x679c9ebe, 0xad403fe5, 0xf325dd09, 0x39f97c52, 0xc62f5400, 0x0cf3f55b, + 0x529617b7, 0x984ab6ec, 0x5941126a, 0x939db331, 0xcdf851dd, 0x0724f086, 0x12077026, 0xd8dbd17d, 0x86be3391, 0x4c6292ca, 0x8d69364c, + 0x47b59717, 0x19d075fb, 0xd30cd4a0, 0x2cdafcf2, 0xe6065da9, 0xb863bf45, 0x72bf1e1e, 0xb3b4ba98, 0x79681bc3, 0x270df92f, 0xedd15874, + 0xd9a0a88b, 0x137c09d0, 0x4d19eb3c, 0x87c54a67, 0x46ceeee1, 0x8c124fba, 0xd277ad56, 0x18ab0c0d, 0xe77d245f, 0x2da18504, 0x73c467e8, + 0xb918c6b3, 0x78136235, 0xb2cfc36e, 0xecaa2182, 0x267680d9, 0x71f1e0c7, 0xbb2d419c, 0xe548a370, 0x2f94022b, 0xee9fa6ad, 0x244307f6, + 0x7a26e51a, 0xb0fa4441, 0x4f2c6c13, 0x85f0cd48, 
0xdb952fa4, 0x11498eff, 0xd0422a79, 0x1a9e8b22, 0x44fb69ce, 0x8e27c895, 0xba56386a, + 0x708a9931, 0x2eef7bdd, 0xe433da86, 0x25387e00, 0xefe4df5b, 0xb1813db7, 0x7b5d9cec, 0x848bb4be, 0x4e5715e5, 0x1032f709, 0xdaee5652, + 0x1be5f2d4, 0xd139538f, 0x8f5cb163, 0x45801038, 0x50a39098, 0x9a7f31c3, 0xc41ad32f, 0x0ec67274, 0xcfcdd6f2, 0x051177a9, 0x5b749545, + 0x91a8341e, 0x6e7e1c4c, 0xa4a2bd17, 0xfac75ffb, 0x301bfea0, 0xf1105a26, 0x3bccfb7d, 0x65a91991, 0xaf75b8ca, 0x9b044835, 0x51d8e96e, + 0x0fbd0b82, 0xc561aad9, 0x046a0e5f, 0xceb6af04, 0x90d34de8, 0x5a0fecb3, 0xa5d9c4e1, 0x6f0565ba, 0x31608756, 0xfbbc260d, 0x3ab7828b, + 0xf06b23d0, 0xae0ec13c, 0x64d26067}}; + +static inline uint32_t +StrCrc(const char* Data) +{ + uint32_t CRC = 0xFFFFFFFF; + while (*Data) + { + char16_t C = *Data++; + int32_t CL = (C & 255); + CRC = (CRC << 8) ^ CRCTable[(CRC >> 24) ^ CL]; + int32_t CH = (C >> 8) & 255; + CRC = (CRC << 8) ^ CRCTable[(CRC >> 24) ^ CH]; + } + return ~CRC; +} + +#define BYTESWAP_ORDER32(x) (((x) >> 24) + (((x) >> 8) & 0xff00) + (((x) << 8) & 0xff0000) + ((x) << 24)) +#define UE_PTRDIFF_TO_INT32(argument) static_cast<int32_t>(argument) + +template<typename T> +constexpr T +Align(T Val, uint64_t Alignment) +{ + return (T)(((uint64_t)Val + Alignment - 1) & ~(Alignment - 1)); +} + +static uint32_t +MemCRC(const void* InData, size_t Length, uint32_t CRC = 0) +{ + // Based on the Slicing-by-8 implementation found here: + // http://slicing-by-8.sourceforge.net/ + + CRC = ~BYTESWAP_ORDER32(CRC); + + const uint8_t* __restrict Data = (uint8_t*)InData; + + // First we need to align to 32-bits + int32_t InitBytes = UE_PTRDIFF_TO_INT32(Align(Data, 4) - Data); + + if (Length > InitBytes) + { + Length -= InitBytes; + + for (; InitBytes; --InitBytes) + { + CRC = (CRC >> 8) ^ CRCTablesSB8[0][(CRC & 0xFF) ^ *Data++]; + } + + auto Data4 = (const uint32_t*)Data; + for (size_t Repeat = Length / 8; Repeat; --Repeat) + { + uint32_t V1 = *Data4++ ^ CRC; + uint32_t V2 = *Data4++; + CRC = 
CRCTablesSB8[7][V1 & 0xFF] ^ CRCTablesSB8[6][(V1 >> 8) & 0xFF] ^ CRCTablesSB8[5][(V1 >> 16) & 0xFF] ^ + CRCTablesSB8[4][V1 >> 24] ^ CRCTablesSB8[3][V2 & 0xFF] ^ CRCTablesSB8[2][(V2 >> 8) & 0xFF] ^ + CRCTablesSB8[1][(V2 >> 16) & 0xFF] ^ CRCTablesSB8[0][V2 >> 24]; + } + Data = (const uint8_t*)Data4; + + Length %= 8; + } + + for (; Length; --Length) + { + CRC = (CRC >> 8) ^ CRCTablesSB8[0][(CRC & 0xFF) ^ *Data++]; + } + + return BYTESWAP_ORDER32(~CRC); +} + +struct CorruptionTrailer +{ + enum + { + /** Arbitrary number used to identify corruption **/ + MagicConstant = 0x1e873d89 + }; + + uint32_t Magic = MagicConstant; + uint32_t Version = 1; + uint32_t CRCofPayload = 0; + uint32_t SizeOfPayload = 0; + + void Initialize(const void* Data, size_t Size) + { + CRCofPayload = MemCRC(Data, Size); + SizeOfPayload = (uint32_t)Size; + } +}; + +std::wstring +GenerateDdcPath(std::string_view Key, std::filesystem::path& rootDir) +{ + std::filesystem::path FilePath = rootDir; + + std::string k8{Key}; + for (auto& c : k8) + c = (char)toupper(c); + + const uint32_t Hash = StrCrc(k8.c_str()); + + std::wstring DirName; + + DirName = u'0' + ((Hash / 100) % 10); + FilePath /= DirName; + DirName = u'0' + ((Hash / 10) % 10); + FilePath /= DirName; + DirName = u'0' + (Hash % 10); + FilePath /= DirName; + + FilePath /= Key; + + auto NativePath = FilePath.native(); + NativePath.append(L".udd"); + + return NativePath; +} + +} // namespace UE + +////////////////////////////////////////////////////////////////////////// + +FileCacheStore::FileCacheStore(const char* RootDir, const char* ReadRootDir) +{ + // Ensure root directory exists - create if it doesn't exist already + + spdlog::info("Initializing FileCacheStore at '{}'", std::string_view(RootDir)); + + m_RootDir = RootDir; + + std::error_code ErrorCode; + + std::filesystem::create_directories(m_RootDir, ErrorCode); + + if (ErrorCode) + { + ExtendableStringBuilder<256> Name; + WideToUtf8(m_RootDir.c_str(), Name); + + spdlog::error("Could 
not open file cache directory '{}' for writing ({})", Name.c_str(), ErrorCode.message()); + + m_IsOk = false; + } + + if (ReadRootDir) + { + m_ReadRootDir = ReadRootDir; + + if (std::filesystem::exists(m_ReadRootDir, ErrorCode)) + { + spdlog::info("FileCacheStore will use additional read tree at '{}'", std::string_view(ReadRootDir)); + + m_ReadRootIsValid = true; + } + } +} + +FileCacheStore::~FileCacheStore() +{ +} + +bool +FileCacheStore::Get(std::string_view Key, CacheValue& OutValue) +{ + CAtlFile File; + + std::wstring nativePath; + + HRESULT hRes = E_FAIL; + + if (m_ReadRootDir.empty() == false) + { + nativePath = UE::GenerateDdcPath(Key, m_ReadRootDir); + + hRes = File.Create(nativePath.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING); + } + + if (FAILED(hRes)) + { + nativePath = UE::GenerateDdcPath(Key, m_RootDir); + + hRes = File.Create(nativePath.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING); + } + + if (FAILED(hRes)) + { + spdlog::debug("GET MISS {}", Key); + + return false; + } + + ULONGLONG FileSize; + File.GetSize(FileSize); + + if (FileSize <= 16) + { + return false; + } + + FileSize -= 16; // CorruptionWrapper trailer + + IoBuffer Value(FileSize); + + uint8_t* ReadPointer = (uint8_t*)Value.Data(); + + while (FileSize) + { + const int MaxChunkSize = 16 * 1024 * 1024; + const int ChunkSize = gsl::narrow_cast<int>((FileSize > MaxChunkSize) ? 
MaxChunkSize : FileSize); + + DWORD BytesRead = 0; + hRes = File.Read(ReadPointer, ChunkSize, BytesRead); + + // A successful read of zero bytes means no further progress is possible; stop instead of looping forever + if (FAILED(hRes) || BytesRead == 0) + { + return false; + } + + ReadPointer += BytesRead; + FileSize -= BytesRead; + } + + OutValue.Value = std::move(Value); + + spdlog::debug("GET HIT {}", Key); + + return true; +} + +/** Stores Value under Key: the payload followed by a corruption trailer is written to a temporary file + created in the root directory, which is then renamed to the path produced by GenerateDdcPath. + Failures are logged and leave no value persisted. */ +void +FileCacheStore::Put(std::string_view Key, const CacheValue& Value) +{ + const void* Data = Value.Value.Data(); + size_t Size = Value.Value.Size(); + + UE::CorruptionTrailer Trailer; + Trailer.Initialize(Data, Size); + + std::wstring nativePath = UE::GenerateDdcPath(Key, m_RootDir); + + CAtlTemporaryFile File; + + spdlog::debug("PUT {}", Key); + + HRESULT hRes = File.Create(m_RootDir.c_str()); + + if (SUCCEEDED(hRes)) + { + const uint8_t* WritePointer = (const uint8_t*)Data; + + while (Size) + { + const int MaxChunkSize = 16 * 1024 * 1024; + const int ChunkSize = (int)((Size > MaxChunkSize) ? MaxChunkSize : Size); + + DWORD BytesWritten = 0; + hRes = File.Write(WritePointer, ChunkSize, &BytesWritten); + + if (FAILED(hRes) || BytesWritten == 0) + { + // No progress is possible - give up rather than spin. Per the ATL documentation the + // CAtlTemporaryFile destructor discards a temp file that was never closed with a name. + spdlog::warn("Failed to write temp file for key '{}' - no value persisted", Key); + + return; + } + + Size -= BytesWritten; + WritePointer += BytesWritten; + } + + hRes = File.Write(&Trailer, sizeof Trailer); + + if (FAILED(hRes)) + { + spdlog::warn("Failed to write trailer for key '{}' - no value persisted", Key); + + return; + } + + hRes = File.Close(nativePath.c_str()); // This renames the file to its final name + + if (FAILED(hRes)) + { + spdlog::warn("Failed to rename temp file for key '{}' - deleting temporary file", Key); + + if (!DeleteFile(File.TempFileName())) + { + spdlog::warn("Temp file for key '{}' could not be deleted - no value persisted", Key); + } + } + } +} + +////////////////////////////////////////////////////////////////////////// + +MemoryCacheStore::MemoryCacheStore() +{ +} + +MemoryCacheStore::~MemoryCacheStore() +{ +} + +bool +MemoryCacheStore::Get(std::string_view InKey, CacheValue& OutValue) +{ + RwLock::SharedLockScope _(m_Lock); + + auto it = m_CacheMap.find(std::string(InKey)); + + if (it == m_CacheMap.end()) + { + return false; + } + else + { + OutValue.Value = it->second; + + return true; + } +} + +void +MemoryCacheStore::Put(std::string_view Key, const 
CacheValue& Value) +{ + RwLock::ExclusiveLockScope _(m_Lock); + m_CacheMap[std::string(Key)] = Value.Value; +} + +////////////////////////////////////////////////////////////////////////// + +ZenCacheStore::ZenCacheStore(zen::CasStore& Cas, const std::filesystem::path& RootDir) : m_DiskLayer{Cas, RootDir} +{ + zen::CreateDirectories(RootDir); +} + +ZenCacheStore::~ZenCacheStore() +{ +} + +bool +ZenCacheStore::Get(std::string_view InBucket, const zen::IoHash& HashKey, CacheValue& OutValue) +{ + bool Ok = m_MemLayer.Get(InBucket, HashKey, OutValue); + + if (!Ok) + { + Ok = m_DiskLayer.Get(InBucket, HashKey, OutValue); + +#if 0 // This would keep file handles open + if (ok) + { + m_memLayer.Put(InBucket, HashKey, OutValue); + } +#endif + } + + return Ok; +} + +void +ZenCacheStore::Put(std::string_view InBucket, const zen::IoHash& HashKey, const CacheValue& Value) +{ + m_MemLayer.Put(InBucket, HashKey, Value); + m_DiskLayer.Put(InBucket, HashKey, Value); +} + +////////////////////////////////////////////////////////////////////////// + +ZenCacheMemoryLayer::ZenCacheMemoryLayer() +{ +} + +ZenCacheMemoryLayer::~ZenCacheMemoryLayer() +{ +} + +bool +ZenCacheMemoryLayer::Get(std::string_view InBucket, const zen::IoHash& HashKey, CacheValue& OutValue) +{ + CacheBucket* Bucket = nullptr; + + { + RwLock::SharedLockScope _(m_Lock); + + auto it = m_Buckets.find(std::string(InBucket)); + + if (it != m_Buckets.end()) + { + Bucket = &it->second; + } + } + + if (Bucket == nullptr) + return false; + + ZEN_ASSERT(Bucket != nullptr); + + return Bucket->Get(HashKey, OutValue); +} + +void +ZenCacheMemoryLayer::Put(std::string_view InBucket, const zen::IoHash& HashKey, const CacheValue& Value) +{ + CacheBucket* Bucket = nullptr; + + { + RwLock::SharedLockScope _(m_Lock); + + auto it = m_Buckets.find(std::string(InBucket)); + + if (it != m_Buckets.end()) + { + Bucket = &it->second; + } + } + + if (Bucket == nullptr) + { + // New bucket + + RwLock::ExclusiveLockScope _(m_Lock); + + Bucket = 
&m_Buckets[std::string(InBucket)]; + } + + ZEN_ASSERT(Bucket != nullptr); + + Bucket->Put(HashKey, Value); +} + +bool +ZenCacheMemoryLayer::CacheBucket::Get(const zen::IoHash& HashKey, CacheValue& OutValue) +{ + RwLock::SharedLockScope _(m_bucketLock); + + auto bucketIt = m_cacheMap.find(HashKey); + + if (bucketIt == m_cacheMap.end()) + { + return false; + } + + OutValue.Value = bucketIt->second; + + return true; +} + +void +ZenCacheMemoryLayer::CacheBucket::Put(const zen::IoHash& HashKey, const CacheValue& Value) +{ + RwLock::ExclusiveLockScope _(m_bucketLock); + + m_cacheMap[HashKey] = Value.Value; +} + +////////////////////////////////////////////////////////////////////////// + +class ZenFile +{ +public: + void Open(std::filesystem::path FileName, bool IsCreate); + void Read(void* Data, uint64_t Size, uint64_t Offset); + void Write(const void* Data, uint64_t Size, uint64_t Offset); + void Flush(); + void* Handle() { return m_File; } + +private: + CAtlFile m_File; +}; + +void +ZenFile::Open(std::filesystem::path FileName, bool isCreate) +{ + const DWORD dwCreationDisposition = isCreate ? 
CREATE_ALWAYS : OPEN_EXISTING; + + HRESULT hRes = m_File.Create(FileName.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, dwCreationDisposition); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to open bucket sobs file"); + } +} + +void +ZenFile::Read(void* Data, uint64_t Size, uint64_t Offset) +{ + OVERLAPPED Ovl{}; + + Ovl.Offset = DWORD(Offset & 0xffff'ffffu); + Ovl.OffsetHigh = DWORD(Offset >> 32); + + HRESULT hRes = m_File.Read(Data, gsl::narrow<DWORD>(Size), &Ovl); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to read from file" /* TODO: add context */); + } +} + +void +ZenFile::Write(const void* Data, uint64_t Size, uint64_t Offset) +{ + OVERLAPPED Ovl{}; + + Ovl.Offset = DWORD(Offset & 0xffff'ffffu); + Ovl.OffsetHigh = DWORD(Offset >> 32); + + HRESULT hRes = m_File.Write(Data, gsl::narrow<DWORD>(Size), &Ovl); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to write to file" /* TODO: add context */); + } +} + +void +ZenFile::Flush() +{ + m_File.Flush(); +} + +////////////////////////////////////////////////////////////////////////// + +class ZenLogFile +{ +public: + ZenLogFile(); + ~ZenLogFile(); + + void Open(std::filesystem::path FileName, size_t RecordSize, bool isCreate); + void Append(const void* DataPointer, uint64_t DataSize); + void Replay(std::function<void(const void*)>&& Handler); + void Flush(); + +private: + CAtlFile m_File; + size_t m_RecordSize = 1; + uint64_t m_AppendOffset = 0; +}; + +ZenLogFile::ZenLogFile() +{ +} + +ZenLogFile::~ZenLogFile() +{ +} + +void +ZenLogFile::Open(std::filesystem::path FileName, size_t RecordSize, bool IsCreate) +{ + m_RecordSize = RecordSize; + + const DWORD dwCreationDisposition = IsCreate ? 
CREATE_ALWAYS : OPEN_EXISTING; + + HRESULT hRes = m_File.Create(FileName.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, dwCreationDisposition); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to open log file" /* TODO: add path */); + } + + // TODO: write/validate header and log contents and prepare for appending/replay +} + +void +ZenLogFile::Replay(std::function<void(const void*)>&& Handler) +{ + std::vector<uint8_t> ReadBuffer; + + ULONGLONG LogFileSize; + m_File.GetSize(LogFileSize); + + // Ensure we end up on a clean boundary + LogFileSize -= LogFileSize % m_RecordSize; + + ReadBuffer.resize(LogFileSize); + + HRESULT hRes = m_File.Read(ReadBuffer.data(), gsl::narrow<DWORD>(ReadBuffer.size())); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to read log file" /* TODO: add context */); + } + + const size_t EntryCount = LogFileSize / m_RecordSize; + + for (int i = 0; i < EntryCount; ++i) + { + Handler(ReadBuffer.data() + (i * m_RecordSize)); + } +} + +void +ZenLogFile::Append(const void* DataPointer, uint64_t DataSize) +{ + HRESULT hRes = m_File.Write(DataPointer, gsl::narrow<DWORD>(DataSize)); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to write to log file" /* TODO: add context */); + } +} + +void +ZenLogFile::Flush() +{ +} + +template<typename T> +class TZenLogFile : public ZenLogFile +{ +public: + void Replay(std::function<void(const T&)>&& Handler) + { + ZenLogFile::Replay([&](const void* VoidPtr) { + const T& Record = *reinterpret_cast<const T*>(VoidPtr); + + Handler(Record); + }); + } + + void Append(const T& Record) { ZenLogFile::Append(&Record, sizeof Record); } +}; + +////////////////////////////////////////////////////////////////////////// + +#pragma pack(push) +#pragma pack(1) + +struct DiskLocation +{ + uint64_t Offset; + uint32_t Size; +}; + +struct DiskIndexEntry +{ + zen::IoHash 
Key; + DiskLocation Location; +}; + +#pragma pack(pop) + +static_assert(sizeof(DiskIndexEntry) == 32); + +struct ZenCacheDiskLayer::CacheBucket +{ + CacheBucket(CasStore& Cas); + ~CacheBucket(); + + void OpenOrCreate(std::filesystem::path BucketDir); + + bool Get(const zen::IoHash& HashKey, CacheValue& OutValue); + void Put(const zen::IoHash& HashKey, const CacheValue& Value); + void Flush(); + + void PutLargeObject(const zen::IoHash& HashKey, const CacheValue& Value); + + inline bool IsOk() const { return m_Ok; } + + CasStore& m_CasStore; + std::filesystem::path m_BucketDir; + Oid m_BucketId; + bool m_Ok = false; + uint64_t m_LargeObjectThreshold = 1024; + + ZenFile m_SobsFile; + TZenLogFile<DiskIndexEntry> m_SlogFile; + ZenFile m_SidxFile; + + void BuildPath(zen::WideStringBuilderBase& Path, const zen::IoHash& HashKey); + + RwLock m_IndexLock; + std::unordered_map<zen::IoHash, DiskLocation, zen::IoHash::Hasher> m_Index; + uint64_t m_WriteCursor = 0; +}; + +ZenCacheDiskLayer::CacheBucket::CacheBucket(CasStore& Cas) : m_CasStore(Cas) +{ +} + +ZenCacheDiskLayer::CacheBucket::~CacheBucket() +{ +} + +void +ZenCacheDiskLayer::CacheBucket::OpenOrCreate(std::filesystem::path BucketDir) +{ + std::filesystem::create_directories(BucketDir); + + m_BucketDir = BucketDir; + + std::wstring ManifestPath{(m_BucketDir / "zen_manifest").c_str()}; + std::wstring SobsPath{(m_BucketDir / "zen.sobs").c_str()}; + std::wstring SlogPath{(m_BucketDir / "zen.slog").c_str()}; + std::wstring SidxPath{(m_BucketDir / "zen.sidx").c_str()}; + + CAtlFile ManifestFile; + + // Try opening existing file first + + bool IsNew = false; + + HRESULT hRes = ManifestFile.Create(ManifestPath.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, OPEN_EXISTING); + + if (SUCCEEDED(hRes)) + { + ULONGLONG FileSize; + ManifestFile.GetSize(FileSize); + + if (FileSize == sizeof(Oid)) + { + hRes = ManifestFile.Read(&m_BucketId, sizeof(Oid)); + + if (SUCCEEDED(hRes)) + { + m_Ok = true; + } + } + + if (!m_Ok) + 
ManifestFile.Close(); + } + + if (!m_Ok) + { + // This is a new bucket + + hRes = ManifestFile.Create(ManifestPath.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, CREATE_ALWAYS); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to create bucket manifest"); + } + + m_BucketId.Generate(); + + hRes = ManifestFile.Write(&m_BucketId, sizeof(Oid)); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to write bucket manifest"); + } + + IsNew = true; + } + + // The bucket is not usable until every file below has been opened and the log replayed; + // if any of those steps throws, the bucket must not be left flagged as OK + m_Ok = false; + + // Initialize small object storage related files + + m_SobsFile.Open(SobsPath, IsNew); + m_SidxFile.Open(SidxPath, IsNew); + + // Open and replay log + + m_SlogFile.Open(SlogPath, sizeof(DiskIndexEntry), IsNew); + + uint64_t maxFileOffset = 0; + + { + // This is not technically necessary but may help future static analysis + zen::RwLock::ExclusiveLockScope _(m_IndexLock); + + m_SlogFile.Replay([&](const DiskIndexEntry& Record) { + m_Index[Record.Key] = Record.Location; + + maxFileOffset = std::max<uint64_t>(maxFileOffset, Record.Location.Offset + Record.Location.Size); + }); + } + + m_WriteCursor = (maxFileOffset + 15) & ~15; + + m_Ok = true; +} + +void +ZenCacheDiskLayer::CacheBucket::BuildPath(zen::WideStringBuilderBase& Path, const zen::IoHash& HashKey) +{ + char hex[sizeof(HashKey.Hash) * 2]; + ToHexBytes(HashKey.Hash, sizeof HashKey.Hash, hex); + + Path.Append(m_BucketDir.c_str()); + Path.Append(L"/"); + Path.AppendAsciiRange(hex, hex + sizeof(hex)); +} + +bool +ZenCacheDiskLayer::CacheBucket::Get(const zen::IoHash& HashKey, CacheValue& OutValue) +{ + if (!m_Ok) + return false; + + { + zen::RwLock::SharedLockScope _(m_IndexLock); + + auto it = m_Index.find(HashKey); + + if (it != m_Index.end()) + { + OutValue.Value = IoBufferBuilder::MakeFromFileHandle(m_SobsFile.Handle(), it->second.Offset, it->second.Size); + + return true; + } + } + + WideStringBuilder<128> dataFilePath; + BuildPath(dataFilePath, HashKey); + + zen::IoBuffer data = IoBufferBuilder::MakeFromFile(dataFilePath.c_str()); + + if (!data) + { + return false; + } + + 
OutValue.Value = data; + + // TODO: should populate index? + + return true; +} + +void +ZenCacheDiskLayer::CacheBucket::Put(const zen::IoHash& HashKey, const CacheValue& Value) +{ + if (!m_Ok) + return; + + if (Value.Value.Size() >= m_LargeObjectThreshold) + PutLargeObject(HashKey, Value); + + // Small object put + + zen::RwLock::ExclusiveLockScope _(m_IndexLock); + + auto it = m_Index.find(HashKey); + + DiskLocation loc{.Offset = m_WriteCursor, .Size = gsl::narrow<uint32_t>(Value.Value.Size())}; + + m_WriteCursor = (m_WriteCursor + loc.Size + 15) & ~15; + + if (it == m_Index.end()) + { + m_Index.insert({HashKey, loc}); + } + else + { + // TODO: should check if write is idempotent and bail out if it is? + + it->second = loc; + } + + DiskIndexEntry indexEntry{.Key = HashKey, .Location = loc}; + + m_SlogFile.Append(indexEntry); + + m_SobsFile.Write(Value.Value.Data(), loc.Size, loc.Offset); + + return; +} + +void +ZenCacheDiskLayer::CacheBucket::Flush() +{ + m_SobsFile.Flush(); + m_SidxFile.Flush(); + m_SlogFile.Flush(); +} + +void +ZenCacheDiskLayer::CacheBucket::PutLargeObject(const zen::IoHash& HashKey, const CacheValue& Value) +{ + zen::WideStringBuilder<128> dataFilePath; + + BuildPath(dataFilePath, HashKey); + + CAtlTemporaryFile dataFile; + + HRESULT hRes = dataFile.Create(m_BucketDir.c_str()); + + hRes = dataFile.Write(Value.Value.Data(), gsl::narrow<DWORD>(Value.Value.Size())); + + if (FAILED(hRes)) + { + // TODO: report error! 
and delete temp file + + return; + } + + hRes = dataFile.Close(dataFilePath.c_str()); +} + +////////////////////////////////////////////////////////////////////////// + +ZenCacheDiskLayer::ZenCacheDiskLayer(CasStore& Cas, const std::filesystem::path& RootDir) : m_RootDir(RootDir), m_CasStore(Cas) +{ +} + +ZenCacheDiskLayer::~ZenCacheDiskLayer() = default; + +/** Looks up HashKey in the named bucket, opening (or creating) the bucket on first use. + Returns true and fills OutValue on a hit. */ +bool +ZenCacheDiskLayer::Get(std::string_view InBucket, const zen::IoHash& HashKey, CacheValue& OutValue) +{ + CacheBucket* Bucket = nullptr; + + { + zen::RwLock::SharedLockScope _(m_Lock); + + auto it = m_Buckets.find(std::string(InBucket)); + + if (it != m_Buckets.end()) + { + Bucket = &it->second; + } + } + + if (Bucket == nullptr) + { + // Bucket needs to be opened/created + + zen::RwLock::ExclusiveLockScope _(m_Lock); + + auto It = m_Buckets.try_emplace(std::string(InBucket), m_CasStore); + Bucket = &It.first->second; + + // Another thread may have added the bucket between the two locks; only the thread that + // actually inserted it opens the files, so an already-open bucket is never opened twice + if (It.second) + { + std::filesystem::path BucketPath = m_RootDir; + BucketPath /= std::string(InBucket); + + Bucket->OpenOrCreate(BucketPath.c_str()); + } + } + + ZEN_ASSERT(Bucket != nullptr); + + return Bucket->Get(HashKey, OutValue); +} + +/** Stores Value under HashKey in the named bucket, opening (or creating) the bucket on first use. + The value is dropped if the bucket is not OK. */ +void +ZenCacheDiskLayer::Put(std::string_view InBucket, const zen::IoHash& HashKey, const CacheValue& Value) +{ + CacheBucket* Bucket = nullptr; + + { + zen::RwLock::SharedLockScope _(m_Lock); + + auto it = m_Buckets.find(std::string(InBucket)); + + if (it != m_Buckets.end()) + { + Bucket = &it->second; + } + } + + if (Bucket == nullptr) + { + // New bucket needs to be created + + zen::RwLock::ExclusiveLockScope _(m_Lock); + + auto It = m_Buckets.try_emplace(std::string(InBucket), m_CasStore); + Bucket = &It.first->second; + + // Same re-check as in Get: skip the open when another thread inserted the bucket first + if (It.second) + { + std::filesystem::path bucketPath = m_RootDir; + bucketPath /= std::string(InBucket); + + Bucket->OpenOrCreate(bucketPath.c_str()); + } + } + + ZEN_ASSERT(Bucket != nullptr); + + if (Bucket->IsOk()) + { + Bucket->Put(HashKey, Value); + } +} + +void +ZenCacheDiskLayer::Flush() +{ + std::vector<CacheBucket*> Buckets; + Buckets.reserve(m_Buckets.size()); + + 
{ + zen::RwLock::SharedLockScope _(m_Lock); + + for (auto& Kv : m_Buckets) + { + Buckets.push_back(&Kv.second); + } + } + + for (auto& Bucket : Buckets) + { + Bucket->Flush(); + } +} + +////////////////////////////////////////////////////////////////////////// + +ZenCacheTracker::ZenCacheTracker(ZenCacheStore& CacheStore) +{ + ZEN_UNUSED(CacheStore); +} + +ZenCacheTracker::~ZenCacheTracker() +{ +} + +void +ZenCacheTracker::TrackAccess(std::string_view Bucket, const zen::IoHash& HashKey) +{ + ZEN_UNUSED(Bucket); + ZEN_UNUSED(HashKey); +} diff --git a/zenserver/cache/cachestore.h b/zenserver/cache/cachestore.h new file mode 100644 index 000000000..1ac01279b --- /dev/null +++ b/zenserver/cache/cachestore.h @@ -0,0 +1,175 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/IoBuffer.h> +#include <zencore/iohash.h> +#include <zencore/thread.h> +#include <zencore/uid.h> +#include <zenstore/cas.h> +#include <compare> +#include <filesystem> +#include <unordered_map> + +namespace zen { + +class WideStringBuilderBase; +class CasStore; + +} // namespace zen + +struct CacheValue +{ + zen::IoBuffer Value; +}; + +/****************************************************************************** + + /$$ /$$/$$ /$$ /$$$$$$ /$$ + | $$ /$$| $$ | $$ /$$__ $$ | $$ + | $$ /$$/| $$ | $$ | $$ \__/ /$$$$$$ /$$$$$$| $$$$$$$ /$$$$$$ + | $$$$$/ | $$ / $$/ | $$ |____ $$/$$_____| $$__ $$/$$__ $$ + | $$ $$ \ $$ $$/ | $$ /$$$$$$| $$ | $$ \ $| $$$$$$$$ + | $$\ $$ \ $$$/ | $$ $$/$$__ $| $$ | $$ | $| $$_____/ + | $$ \ $$ \ $/ | $$$$$$| $$$$$$| $$$$$$| $$ | $| $$$$$$$ + |__/ \__/ \_/ \______/ \_______/\_______|__/ |__/\_______/ + + Basic Key-Value cache. No restrictions on keys, and values are always opaque + binary blobs. 
+ +******************************************************************************/ + +class CacheStore +{ +public: + // Polymorphic base: keep destruction through a CacheStore pointer well-defined + virtual ~CacheStore() = default; + + virtual bool Get(std::string_view Key, CacheValue& OutValue) = 0; + virtual void Put(std::string_view Key, const CacheValue& Value) = 0; +}; + +/** File system based implementation + + Emulates the behaviour of UE4 with regards to file system structure, + and also adds a file corruption trailer to remain compatible with + the file-system based implementation (this should be made configurable) + + */ +class FileCacheStore : public CacheStore +{ +public: + FileCacheStore(const char* RootDir, const char* ReadRootDir = nullptr); + ~FileCacheStore(); + + virtual bool Get(std::string_view Key, CacheValue& OutValue) override; + virtual void Put(std::string_view Key, const CacheValue& Value) override; + +private: + std::filesystem::path m_RootDir; + std::filesystem::path m_ReadRootDir; + bool m_IsOk = true; + bool m_ReadRootIsValid = false; +}; + +class MemoryCacheStore : public CacheStore +{ +public: + MemoryCacheStore(); + ~MemoryCacheStore(); + + virtual bool Get(std::string_view Key, CacheValue& OutValue) override; + virtual void Put(std::string_view Key, const CacheValue& Value) override; + +private: + zen::RwLock m_Lock; + std::unordered_map<std::string, zen::IoBuffer> m_CacheMap; +}; + +/****************************************************************************** + + /$$$$$$$$ /$$$$$$ /$$ + |_____ $$ /$$__ $$ | $$ + /$$/ /$$$$$$ /$$$$$$$ | $$ \__/ /$$$$$$ /$$$$$$| $$$$$$$ /$$$$$$ + /$$/ /$$__ $| $$__ $$ | $$ |____ $$/$$_____| $$__ $$/$$__ $$ + /$$/ | $$$$$$$| $$ \ $$ | $$ /$$$$$$| $$ | $$ \ $| $$$$$$$$ + /$$/ | $$_____| $$ | $$ | $$ $$/$$__ $| $$ | $$ | $| $$_____/ + /$$$$$$$| $$$$$$| $$ | $$ | $$$$$$| $$$$$$| $$$$$$| $$ | $| $$$$$$$ + |________/\_______|__/ |__/ \______/ \_______/\_______|__/ |__/\_______/ + + Cache store for UE5. Restricts keys to "{bucket}/{hash}" pairs where the hash + is 40 (hex) chars in size. 
Values may be opaque blobs or structured objects + which can in turn contain references to other objects. + +******************************************************************************/ + +class ZenCacheMemoryLayer +{ +public: + ZenCacheMemoryLayer(); + ~ZenCacheMemoryLayer(); + + bool Get(std::string_view Bucket, const zen::IoHash& HashKey, CacheValue& OutValue); + void Put(std::string_view Bucket, const zen::IoHash& HashKey, const CacheValue& Value); + +private: + struct CacheBucket + { + zen::RwLock m_bucketLock; + std::unordered_map<zen::IoHash, zen::IoBuffer, zen::IoHash::Hasher> m_cacheMap; + + bool Get(const zen::IoHash& HashKey, CacheValue& OutValue); + void Put(const zen::IoHash& HashKey, const CacheValue& Value); + }; + + zen::RwLock m_Lock; + std::unordered_map<std::string, CacheBucket> m_Buckets; +}; + +class ZenCacheDiskLayer +{ +public: + ZenCacheDiskLayer(zen::CasStore& Cas, const std::filesystem::path& RootDir); + ~ZenCacheDiskLayer(); + + bool Get(std::string_view Bucket, const zen::IoHash& HashKey, CacheValue& OutValue); + void Put(std::string_view Bucket, const zen::IoHash& HashKey, const CacheValue& Value); + + void Flush(); + +private: + /** A cache bucket manages a single directory containing + metadata and data for that bucket + */ + struct CacheBucket; + + zen::CasStore& m_CasStore; + std::filesystem::path m_RootDir; + zen::RwLock m_Lock; + std::unordered_map<std::string, CacheBucket> m_Buckets; // TODO: make this case insensitive +}; + +class ZenCacheStore +{ +public: + ZenCacheStore(zen::CasStore& Cas, const std::filesystem::path& RootDir); + ~ZenCacheStore(); + + virtual bool Get(std::string_view Bucket, const zen::IoHash& HashKey, CacheValue& OutValue); + virtual void Put(std::string_view Bucket, const zen::IoHash& HashKey, const CacheValue& Value); + +private: + std::filesystem::path m_RootDir; + ZenCacheMemoryLayer m_MemLayer; + ZenCacheDiskLayer m_DiskLayer; +}; + +/** Tracks cache entry access, stats and orchestrates cleanup 
activities + */ +class ZenCacheTracker +{ +public: + ZenCacheTracker(ZenCacheStore& CacheStore); + ~ZenCacheTracker(); + + void TrackAccess(std::string_view Bucket, const zen::IoHash& HashKey); + +private: +}; diff --git a/zenserver/cache/kvcache.cpp b/zenserver/cache/kvcache.cpp new file mode 100644 index 000000000..404b17e5a --- /dev/null +++ b/zenserver/cache/kvcache.cpp @@ -0,0 +1,208 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include "kvcache.h" + +#include <zencore/httpserver.h> +#include <zencore/memory.h> +#include <zencore/timer.h> +#include "cachestore.h" +#include "upstream/jupiter.h" + +#include <rocksdb/db.h> +#include <spdlog/spdlog.h> + +namespace zen { + +namespace rocksdb = ROCKSDB_NAMESPACE; +using namespace fmt::literals; +using namespace std::literals; + +////////////////////////////////////////////////////////////////////////// + +struct HttpKvCacheService::AccessTracker +{ + AccessTracker(); + ~AccessTracker(); + + void TrackAccess(std::string_view Key); + void Flush(); + +private: + RwLock m_Lock; + ChunkingLinearAllocator m_AccessRecordAllocator{8192}; +}; + +HttpKvCacheService::AccessTracker::AccessTracker() +{ +} + +HttpKvCacheService::AccessTracker::~AccessTracker() +{ + RwLock::ExclusiveLockScope _(m_Lock); +} + +void +HttpKvCacheService::AccessTracker::Flush() +{ + RwLock::ExclusiveLockScope _(m_Lock); + + m_AccessRecordAllocator.Reset(); +} + +void +HttpKvCacheService::AccessTracker::TrackAccess(std::string_view Key) +{ + // Once it matters, this should use a thread-local means of updating this data, + // like Concurrency::combinable or similar + + RwLock::ExclusiveLockScope _(m_Lock); + + const uint64_t KeySize = Key.size(); + void* Ptr = m_AccessRecordAllocator.Alloc(KeySize + 1); + memcpy(Ptr, Key.data(), KeySize); + reinterpret_cast<uint8_t*>(Ptr)[KeySize] = 0; +} + +////////////////////////////////////////////////////////////////////////// + +HttpKvCacheService::HttpKvCacheService() +{ + m_Cloud = 
new CloudCacheClient("https://jupiter.devtools.epicgames.com"sv, + "ue4.ddc"sv /* namespace */, + "https://epicgames.okta.com/oauth2/auso645ojjWVdRI3d0x7/v1/token"sv /* provider */, + "0oao91lrhqPiAlaGD0x7"sv /* client id */, + "-GBWjjenhCgOwhxL5yBKNJECVIoDPH0MK4RDuN7d"sv /* oauth secret */); + + m_AccessTracker = std::make_unique<AccessTracker>(); +} + +HttpKvCacheService::~HttpKvCacheService() +{ +} + +const char* +HttpKvCacheService::BaseUri() const +{ + return "/cache/"; +} + +void +HttpKvCacheService::HandleRequest(zen::HttpServerRequest& Request) +{ + using namespace std::literals; + + std::string_view Key = Request.RelativeUri(); + + switch (auto Verb = Request.RequestVerb()) + { + using enum zen::HttpVerb; + + case kHead: + case kGet: + { + m_AccessTracker->TrackAccess(Key); + + CacheValue Value; + bool Success = m_cache.Get(Key, Value); + + if (!Success) + { + // Success = m_cache_.Get(Key, Value); + + if (!Success) + { + CloudCacheSession Session(m_Cloud); + + zen::Stopwatch Timer; + + if (IoBuffer CloudValue = Session.Get("default", Key)) + { + Success = true; + + spdlog::debug("upstream HIT after {:5} {:6}! {}", + zen::NiceTimeSpanMs(Timer.getElapsedTimeMs()), + NiceBytes(CloudValue.Size()), + Key); + + Value.Value = CloudValue; + } + else + { + spdlog::debug("upstream miss after {:5}! 
{}", zen::NiceTimeSpanMs(Timer.getElapsedTimeMs()), Key); + } + } + + if (Success && (Value.Value.Size() <= m_InMemoryBlobSizeThreshold)) + { + m_cache.Put(Key, Value); + } + } + + if (!Success) + { + Request.WriteResponse(zen::HttpResponse::NotFound); + } + else + { + if (Verb == zen::HttpVerb::kHead) + { + Request.SetSuppressResponseBody(); + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, Value.Value); + } + else + { + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, Value.Value); + } + } + } + break; + + case kPut: + { + if (zen::IoBuffer Body = Request.ReadPayload()) + { + CacheValue Value; + Value.Value = Body; + + if (Value.Value.Size() <= m_InMemoryBlobSizeThreshold) + { + m_cache.Put(Key, Value); + } + + // m_cache_.Put(Key, Value); + + CloudCacheSession Session(m_Cloud); + + zen::Stopwatch Timer; + + Session.Put("default", Key, Value.Value); + + spdlog::debug("upstream PUT took {:5} {:6}! {}", + zen::NiceTimeSpanMs(Timer.getElapsedTimeMs()), + NiceBytes(Value.Value.Size()), + Key); + + Request.WriteResponse(zen::HttpResponse::Created); + } + else + { + return; + } + } + break; + + case kDelete: + // should this do anything? + return Request.WriteResponse(zen::HttpResponse::OK); + + case kPost: + break; + + default: + break; + } +} + +} // namespace zen diff --git a/zenserver/cache/kvcache.h b/zenserver/cache/kvcache.h new file mode 100644 index 000000000..e601582a4 --- /dev/null +++ b/zenserver/cache/kvcache.h @@ -0,0 +1,38 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/httpserver.h> + +#include "cachestore.h" +#include "upstream/jupiter.h" + +namespace zen { + +/** + * Generic HTTP K/V cache - can be consumed via legacy DDC interfaces, with + * no key format conventions. 
Values are blobs + */ + +class HttpKvCacheService : public zen::HttpService +{ +public: + HttpKvCacheService(); + ~HttpKvCacheService(); + + virtual const char* BaseUri() const override; + virtual void HandleRequest(zen::HttpServerRequest& Request) override; + +private: + MemoryCacheStore m_cache; + FileCacheStore m_cache_{"E:\\Local-DDC-Write", "E:\\Local-DDC" /* Read */}; + RefPtr<CloudCacheClient> m_Cloud; + uint64_t m_InMemoryBlobSizeThreshold = 16384; + uint64_t m_FileBlobSizeThreshold = 16 * 1024 * 1024; + + struct AccessTracker; + + std::unique_ptr<AccessTracker> m_AccessTracker; +}; + +} // namespace zen diff --git a/zenserver/cache/structuredcache.cpp b/zenserver/cache/structuredcache.cpp new file mode 100644 index 000000000..0d62f297c --- /dev/null +++ b/zenserver/cache/structuredcache.cpp @@ -0,0 +1,129 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/fmtutils.h> +#include <zencore/httpserver.h> + +#include "cachestore.h" +#include "structuredcache.h" +#include "upstream/jupiter.h" + +#include <spdlog/spdlog.h> +#include <filesystem> + +namespace zen { + +HttpStructuredCacheService::HttpStructuredCacheService(std::filesystem::path RootPath, zen::CasStore& InStore) +: m_CasStore(InStore) +, m_CacheStore(InStore, RootPath) +{ + spdlog::info("initializing structured cache at '{}'", RootPath); +} + +HttpStructuredCacheService::~HttpStructuredCacheService() +{ + spdlog::info("closing structured cache"); +} + +const char* +HttpStructuredCacheService::BaseUri() const +{ + return "/z$/"; +} + +void +HttpStructuredCacheService::HandleRequest(zen::HttpServerRequest& Request) +{ + CacheRef Ref; + + if (!ValidateUri(Request, /* out */ Ref)) + { + return Request.WriteResponse(zen::HttpResponse::BadRequest); // invalid URL + } + + switch (auto Verb = Request.RequestVerb()) + { + using enum zen::HttpVerb; + + case kHead: + case kGet: + { + CacheValue Value; + bool Success = m_CacheStore.Get(Ref.BucketSegment, Ref.HashKey, /* 
out */ Value); + + if (!Success) + { + Request.WriteResponse(zen::HttpResponse::NotFound); + } + else + { + if (Verb == kHead) + { + Request.SetSuppressResponseBody(); + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, Value.Value); + } + else + { + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, Value.Value); + } + } + } + break; + + case kPut: + { + if (zen::IoBuffer Body = Request.ReadPayload()) + { + CacheValue Value; + Value.Value = Body; + + m_CacheStore.Put(Ref.BucketSegment, Ref.HashKey, Value); + + Request.WriteResponse(zen::HttpResponse::Created); + } + else + { + return; + } + } + break; + + case kPost: + break; + + default: + break; + } +} + +[[nodiscard]] bool +HttpStructuredCacheService::ValidateUri(zen::HttpServerRequest& Request, CacheRef& OutRef) +{ + std::string_view Key = Request.RelativeUri(); + std::string_view::size_type BucketSplitOffset = Key.find_last_of('/'); + + if (BucketSplitOffset == std::string_view::npos) + { + return false; + } + + OutRef.BucketSegment = Key.substr(0, BucketSplitOffset); + std::string_view HashSegment = Key.substr(BucketSplitOffset + 1); + + if (HashSegment.size() != (2 * sizeof OutRef.HashKey.Hash)) + { + return false; + } + + bool IsOk = zen::ParseHexBytes(HashSegment.data(), HashSegment.size(), OutRef.HashKey.Hash); + + if (!IsOk) + { + return false; + } + + return true; +} + +} // namespace zen diff --git a/zenserver/cache/structuredcache.h b/zenserver/cache/structuredcache.h new file mode 100644 index 000000000..d0646e6e9 --- /dev/null +++ b/zenserver/cache/structuredcache.h @@ -0,0 +1,40 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/httpserver.h> + +#include "cachestore.h" +#include "upstream/jupiter.h" + +namespace zen { + +/** + * New-style cache service. 
Imposes constraints on keys, supports blobs and + * structured values + */ + +class HttpStructuredCacheService : public zen::HttpService +{ +public: + HttpStructuredCacheService(std::filesystem::path RootPath, zen::CasStore& InStore); + ~HttpStructuredCacheService(); + + virtual const char* BaseUri() const override; + + virtual void HandleRequest(zen::HttpServerRequest& Request) override; + +private: + struct CacheRef + { + std::string BucketSegment; + IoHash HashKey; + }; + + [[nodiscard]] bool ValidateUri(zen::HttpServerRequest& Request, CacheRef& OutRef); + + zen::CasStore& m_CasStore; + ZenCacheStore m_CacheStore; +}; + +} // namespace zen diff --git a/zenserver/casstore.cpp b/zenserver/casstore.cpp new file mode 100644 index 000000000..4afcf21a6 --- /dev/null +++ b/zenserver/casstore.cpp @@ -0,0 +1,155 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "casstore.h" + +#include <zencore/streamutil.h> + +#include <spdlog/spdlog.h> +#include <gsl/gsl-lite.hpp> + +namespace zen { + +HttpCasService::HttpCasService(CasStore& Store) : m_CasStore(Store) +{ + m_Router.AddPattern("cas", "([0-9A-Fa-f]{40})"); + + m_Router.RegisterRoute( + "batch", + [this](HttpRouterRequest& Req) { + HttpServerRequest& ServerRequest = Req.ServerRequest(); + + IoBuffer Payload = ServerRequest.ReadPayload(); + uint64_t EntryCount = Payload.Size() / sizeof(IoHash); + + if ((EntryCount * sizeof(IoHash)) != Payload.Size()) + { + return ServerRequest.WriteResponse(HttpResponse::BadRequest); + } + + const IoHash* Hashes = reinterpret_cast<const IoHash*>(Payload.Data()); + std::vector<IoBuffer> Values; + + MemoryOutStream HeaderStream; + BinaryWriter HeaderWriter(HeaderStream); + + Values.emplace_back(); // Placeholder for header + + // Build response header + HeaderWriter << uint32_t(0x12340000) << uint32_t(0); + + for (uint64_t i = 0; i < EntryCount; ++i) + { + IoHash ChunkHash = Hashes[i]; + IoBuffer Value = m_CasStore.FindChunk(ChunkHash); + + if (Value) + { + 
HeaderWriter << ChunkHash << uint64_t(Value.Size()); // read the size before Value is moved-from + Values.emplace_back(std::move(Value)); + } + } + + // Make real header + + const_cast<uint32_t*>(reinterpret_cast<const uint32_t*>(HeaderStream.Data()))[1] = uint32_t(Values.size() - 1); + + Values[0] = IoBufferBuilder::MakeCloneFromMemory(HeaderStream.Data(), HeaderStream.Size()); + + ServerRequest.WriteResponse(HttpResponse::OK, HttpContentType::kBinary, Values); + }, + HttpVerb::kPost); + + m_Router.RegisterRoute( + "{cas}", + [this](HttpRouterRequest& Req) { + IoHash Hash = IoHash::FromHexString(Req.GetCapture(1)); + spdlog::debug("CAS request for {}", Hash); + + HttpServerRequest& ServerRequest = Req.ServerRequest(); + + switch (ServerRequest.RequestVerb()) + { + case HttpVerb::kGet: + case HttpVerb::kHead: + { + if (IoBuffer Value = m_CasStore.FindChunk(Hash)) + { + return ServerRequest.WriteResponse(HttpResponse::OK, HttpContentType::kBinary, Value); + } + + return ServerRequest.WriteResponse(HttpResponse::NotFound); + } + break; + + case HttpVerb::kPut: + { + IoBuffer Payload = ServerRequest.ReadPayload(); + IoHash PayloadHash = IoHash::HashMemory(Payload.Data(), Payload.Size()); + + // URI hash must match content hash + if (PayloadHash != Hash) + { + return ServerRequest.WriteResponse(HttpResponse::BadRequest); + } + + m_CasStore.InsertChunk(Payload.Data(), Payload.Size(), PayloadHash); + + return ServerRequest.WriteResponse(HttpResponse::OK); + } + break; + } + }, + HttpVerb::kGet | HttpVerb::kPut | HttpVerb::kHead); +} + +const char* +HttpCasService::BaseUri() const +{ + return "/cas/"; +} + +void +HttpCasService::HandleRequest(zen::HttpServerRequest& Request) +{ + if (Request.RelativeUri().empty()) + { + // Root URI request + + switch (Request.RequestVerb()) + { + case HttpVerb::kPut: + case HttpVerb::kPost: + { + IoBuffer Payload = Request.ReadPayload(); + IoHash PayloadHash = IoHash::HashMemory(Payload.Data(), Payload.Size()); + + spdlog::debug("CAS POST request for {} ({} bytes)", PayloadHash, 
Payload.Size()); + + auto InsertResult = m_CasStore.InsertChunk(Payload.Data(), Payload.Size(), PayloadHash); + + if (InsertResult.New) + { + return Request.WriteResponse(HttpResponse::Created); + } + else + { + return Request.WriteResponse(HttpResponse::OK); + } + } + break; + + case HttpVerb::kGet: + case HttpVerb::kHead: + break; + + default: + break; + } + } + else + { + m_Router.HandleRequest(Request); + } +} + +} // namespace zen diff --git a/zenserver/casstore.h b/zenserver/casstore.h new file mode 100644 index 000000000..7166f796e --- /dev/null +++ b/zenserver/casstore.h @@ -0,0 +1,34 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/httpserver.h> +#include <zenstore/cas.h> + +namespace zen { + +/** + * Simple CAS store HTTP endpoint + * + * Note that since this does not end up pinning any of the chunks it's only really useful for a small subset of use cases where you know a + * chunk exists in the underlying CAS store. Thus it's mainly useful for internal use when communicating between Zen store instances + * + * Using this interface for adding CAS chunks makes little sense except for testing purposes as garbage collection may reap anything you add + * before anything ever gets to access it + */ + +class HttpCasService : public HttpService +{ +public: + explicit HttpCasService(CasStore& Store); + ~HttpCasService() = default; + + virtual const char* BaseUri() const override; + virtual void HandleRequest(zen::HttpServerRequest& Request) override; + +private: + CasStore& m_CasStore; + HttpRequestRouter m_Router; +}; + +} // namespace zen diff --git a/zenserver/config.cpp b/zenserver/config.cpp new file mode 100644 index 000000000..027427528 --- /dev/null +++ b/zenserver/config.cpp @@ -0,0 +1,157 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "config.h" + +#include "diag/logging.h" + +#include <zencore/fmtutils.h> +#include <zencore/iobuffer.h> +#include <zencore/string.h> + +#pragma warning(push) +#pragma warning(disable : 4267) // warning C4267: '=': conversion from 'size_t' to 'US', possible loss of data +#include <cxxopts.hpp> +#pragma warning(pop) + +#include <fmt/format.h> +#include <spdlog/spdlog.h> +#include <sol/sol.hpp> + +#if ZEN_PLATFORM_WINDOWS + +// Used for getting My Documents for default data directory +# include <ShlObj.h> +# pragma comment(lib, "shell32.lib") + +std::filesystem::path +PickDefaultStateDirectory() +{ + // Pick sensible default + + WCHAR myDocumentsDir[MAX_PATH]; + HRESULT hRes = SHGetFolderPathW(NULL, + CSIDL_PERSONAL /* My Documents */, + NULL, + SHGFP_TYPE_CURRENT, + /* out */ myDocumentsDir); + + if (SUCCEEDED(hRes)) + { + wcscat_s(myDocumentsDir, L"\\zen"); + + return myDocumentsDir; + } + + return L""; +} + +#else + +std::filesystem::path +PickDefaultStateDirectory() +{ + return std::filesystem::path("~/.zen"); +} + +#endif + +void +ParseGlobalCliOptions(int argc, char* argv[], ZenServerOptions& GlobalOptions) +{ + cxxopts::Options options("zenserver", "Zen Server"); + options.add_options()("d, debug", "Enable debugging", cxxopts::value<bool>(GlobalOptions.IsDebug)->default_value("false")); + options.add_options()("help", "Show command line help"); + options.add_options()("t, test", "Enable test mode", cxxopts::value<bool>(GlobalOptions.IsTest)->default_value("false")); + options.add_options()("log-id", "Specify id for adding context to log output", cxxopts::value<std::string>(GlobalOptions.LogId)); + options.add_options()("data-dir", "Specify persistence root", cxxopts::value<std::filesystem::path>(GlobalOptions.DataDir)); + + options + .add_option("lifetime", "", "owner-pid", "Specify owning process id", cxxopts::value<int>(GlobalOptions.OwnerPid), "<identifier>"); + options.add_option("lifetime", + "", + "child-id", + "Specify id which can be used to 
signal parent", + cxxopts::value<std::string>(GlobalOptions.ChildId), + "<identifier>"); + + options.add_option("network", + "p", + "port", + "Select HTTP port", + cxxopts::value<int>(GlobalOptions.BasePort)->default_value("1337"), + "<port number>"); + + try + { + auto result = options.parse(argc, argv); + + if (result.count("help")) + { + ConsoleLog().info("{}", options.help()); + + exit(0); + } + } + catch (cxxopts::OptionParseException& e) + { + ConsoleLog().error("Error parsing zenserver arguments: {}\n\n{}", e.what(), options.help()); + + throw; + } + + if (GlobalOptions.DataDir.empty()) + { + GlobalOptions.DataDir = PickDefaultStateDirectory(); + } +} + +void +ParseServiceConfig(const std::filesystem::path& DataRoot, ZenServiceConfig& ServiceConfig) +{ + using namespace fmt::literals; + + std::filesystem::path ConfigScript = DataRoot / "zen_cfg.lua"; + zen::IoBuffer LuaScript = zen::IoBufferBuilder::MakeFromFile(ConfigScript.native().c_str()); + + if (LuaScript) + { + sol::state lua; + + // Provide some context to help derive defaults + lua.set("dataroot", DataRoot.native()); + + lua.open_libraries(sol::lib::base); + + // We probably want to limit the scope of this so the script won't see + // any more than it needs to + + lua.set_function("getenv", [&](const std::string env) -> sol::object { + std::wstring EnvVarValue; + size_t RequiredSize = 0; + std::wstring EnvWide = zen::Utf8ToWide(env); + _wgetenv_s(&RequiredSize, nullptr, 0, EnvWide.c_str()); + + if (RequiredSize == 0) + return sol::make_object(lua, sol::lua_nil); + + EnvVarValue.resize(RequiredSize); + _wgetenv_s(&RequiredSize, EnvVarValue.data(), RequiredSize, EnvWide.c_str()); + return sol::make_object(lua, zen::WideToUtf8(EnvVarValue.c_str())); + }); + + try + { + sol::load_result config = lua.load(std::string_view((const char*)LuaScript.Data(), LuaScript.Size()), "zencfg"); + config(); + } + catch (std::exception& e) + { + spdlog::error("config script failure: {}", e.what()); + + throw 
std::exception("fatal zen global config script ({}) failure: {}"_format(ConfigScript, e.what()).c_str()); + } + ServiceConfig.LegacyCacheEnabled = lua["legacycache"]["enable"]; + const std::string path = lua["legacycache"]["readpath"]; + ServiceConfig.StructuredCacheEnabled = lua["structuredcache"]["enable"]; + } +} diff --git a/zenserver/config.h b/zenserver/config.h new file mode 100644 index 000000000..c96dc139a --- /dev/null +++ b/zenserver/config.h @@ -0,0 +1,28 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <filesystem> +#include <string> + +struct ZenServerOptions +{ + bool IsDebug = false; + bool IsTest = false; + int BasePort = 1337; // Service listen port (used for both UDP and TCP) + int OwnerPid = 0; // Parent process id (zero for standalone) + std::string ChildId; // Id assigned by parent process (used for lifetime management) + std::string LogId; // Id for tagging log output + std::filesystem::path DataDir; // Root directory for state (used for testing) + std::string FlockId; // Id for grouping test instances into sets +}; + +void ParseGlobalCliOptions(int argc, char* argv[], ZenServerOptions& GlobalOptions); + +struct ZenServiceConfig +{ + bool LegacyCacheEnabled = false; + bool StructuredCacheEnabled = true; +}; + +void ParseServiceConfig(const std::filesystem::path& DataRoot, ZenServiceConfig& ServiceConfig); diff --git a/zenserver/diag/crashreport.cpp b/zenserver/diag/crashreport.cpp new file mode 100644 index 000000000..03e74ca5c --- /dev/null +++ b/zenserver/diag/crashreport.cpp @@ -0,0 +1,85 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "crashreport.h" + +#include <zencore/filesystem.h> +#include <zencore/zencore.h> + +#include <client/windows/handler/exception_handler.h> + +#include <filesystem> + +// A callback function to run after the minidump has been written. +// minidump_id is a unique id for the dump, so the minidump +// file is <dump_path>\<minidump_id>.dmp. 
context is the parameter supplied +// by the user as callback_context when the handler was created. exinfo +// points to the exception record, or NULL if no exception occurred. +// succeeded indicates whether a minidump file was successfully written. +// assertion points to information about an assertion if the handler was +// invoked by an assertion. +// +// If an exception occurred and the callback returns true, Breakpad will treat +// the exception as fully-handled, suppressing any other handlers from being +// notified of the exception. If the callback returns false, Breakpad will +// treat the exception as unhandled, and allow another handler to handle it. +// If there are no other handlers, Breakpad will report the exception to the +// system as unhandled, allowing a debugger or native crash dialog the +// opportunity to handle the exception. Most callback implementations +// should normally return the value of |succeeded|, or when they wish to +// not report an exception of handled, false. Callbacks will rarely want to +// return true directly (unless |succeeded| is true). +// +// For out-of-process dump generation, dump path and minidump ID will always +// be NULL. In case of out-of-process dump generation, the dump path and +// minidump id are controlled by the server process and are not communicated +// back to the crashing process. + +static bool +CrashMinidumpCallback(const wchar_t* dump_path, + const wchar_t* minidump_id, + void* context, + EXCEPTION_POINTERS* exinfo, + MDRawAssertionInfo* assertion, + bool succeeded) +{ + ZEN_UNUSED(dump_path, minidump_id, context, exinfo, assertion, succeeded); + + // TODO! + return succeeded; +} + +// A callback function to run before Breakpad performs any substantial +// processing of an exception. A FilterCallback is called before writing +// a minidump. context is the parameter supplied by the user as +// callback_context when the handler was created. 
exinfo points to the +// exception record, if any; assertion points to assertion information, +// if any. +// +// If a FilterCallback returns true, Breakpad will continue processing, +// attempting to write a minidump. If a FilterCallback returns false, +// Breakpad will immediately report the exception as unhandled without +// writing a minidump, allowing another handler the opportunity to handle it. + +bool +CrashFilterCallback(void* context, EXCEPTION_POINTERS* exinfo, MDRawAssertionInfo* assertion) +{ + ZEN_UNUSED(context, exinfo, assertion); + + // Yes, write a dump (returning true tells Breakpad to continue and write the minidump) + return true; +} + +void +InitializeCrashReporting(const std::filesystem::path& DumpPath) +{ + // handler_types specifies the types of handlers that should be installed. + + zen::CreateDirectories(DumpPath); + + static google_breakpad::ExceptionHandler _(DumpPath.native().c_str(), // Dump path + CrashFilterCallback, // Filter Callback + CrashMinidumpCallback, // Minidump callback + nullptr, // Callback context + google_breakpad::ExceptionHandler::HANDLER_ALL // Handler Types + ); +} diff --git a/zenserver/diag/crashreport.h b/zenserver/diag/crashreport.h new file mode 100644 index 000000000..6369d1cf5 --- /dev/null +++ b/zenserver/diag/crashreport.h @@ -0,0 +1,9 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +namespace std::filesystem { +class path; +} + +void InitializeCrashReporting(const std::filesystem::path& DumpPath); diff --git a/zenserver/diag/diagsvcs.h b/zenserver/diag/diagsvcs.h new file mode 100644 index 000000000..84f8d22ee --- /dev/null +++ b/zenserver/diag/diagsvcs.h @@ -0,0 +1,103 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <zencore/httpserver.h> +#include <zencore/iobuffer.h> + +////////////////////////////////////////////////////////////////////////// + +class HttpTestService : public zen::HttpService +{ + uint32_t LogPoint = 0; + +public: + HttpTestService() {} + ~HttpTestService() = default; + + virtual const char* BaseUri() const override { return "/test/"; } + + virtual void HandleRequest(zen::HttpServerRequest& Request) override + { + using namespace std::literals; + + auto Uri = Request.RelativeUri(); + + if (Uri == "hello"sv) + { + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kText, u8"hello world!"sv); + + // OutputLogMessageInternal(&LogPoint, 0, 0); + } + else if (Uri == "1K"sv) + { + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, m_1k); + } + else if (Uri == "1M"sv) + { + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, m_1m); + } + else if (Uri == "1M_1k"sv) + { + std::vector<zen::IoBuffer> Buffers; + Buffers.reserve(1024); + + for (int i = 0; i < 1024; ++i) + { + Buffers.push_back(m_1k); + } + + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, Buffers); + } + else if (Uri == "1G"sv) + { + std::vector<zen::IoBuffer> Buffers; + Buffers.reserve(1024); + + for (int i = 0; i < 1024; ++i) + { + Buffers.push_back(m_1m); + } + + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, Buffers); + } + else if (Uri == "1G_1k"sv) + { + std::vector<zen::IoBuffer> Buffers; + Buffers.reserve(1024 * 1024); + + for (int i = 0; i < 1024 * 1024; ++i) + { + Buffers.push_back(m_1k); + } + + Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kBinary, Buffers); + } + } + +private: + zen::IoBuffer m_1m{1024 * 1024}; + zen::IoBuffer m_1k{m_1m, 0u, 1024}; +}; + +class HttpHealthService : public zen::HttpService +{ +public: + HttpHealthService() = default; + ~HttpHealthService() = default; + + virtual const char* BaseUri() const 
override { return "/health/"; } + + virtual void HandleRequest(zen::HttpServerRequest& Request) override + { + using namespace std::literals; + + switch (Request.RequestVerb()) + { + case zen::HttpVerb::kGet: + return Request.WriteResponse(zen::HttpResponse::OK, zen::HttpContentType::kText, u8"OK!"sv); + } + } + +private: +}; diff --git a/zenserver/diag/logging.cpp b/zenserver/diag/logging.cpp new file mode 100644 index 000000000..2bf0e50aa --- /dev/null +++ b/zenserver/diag/logging.cpp @@ -0,0 +1,204 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "logging.h" + +#include "config.h" + +#include <spdlog/pattern_formatter.h> +#include <spdlog/sinks/ansicolor_sink.h> +#include <spdlog/sinks/stdout_color_sinks.h> +#include <spdlog/spdlog.h> +#include <memory> + +// Custom logging -- test code, this should be tweaked + +namespace logging { + +using namespace spdlog; +using namespace spdlog::details; +using namespace std::literals; + +class full_formatter final : public spdlog::formatter +{ +public: + full_formatter(std::string_view LogId, std::chrono::time_point<std::chrono::system_clock> Epoch) : m_Epoch(Epoch), m_LogId(LogId) {} + + virtual std::unique_ptr<formatter> clone() const override { return std::make_unique<full_formatter>(m_LogId, m_Epoch); } + + static constexpr bool UseDate = false; + + virtual void format(const details::log_msg& msg, memory_buf_t& dest) override + { + using std::chrono::duration_cast; + using std::chrono::milliseconds; + using std::chrono::seconds; + + if constexpr (UseDate) + { + auto secs = std::chrono::duration_cast<seconds>(msg.time.time_since_epoch()); + if (secs != m_LastLogSecs) + { + m_CachedTm = os::localtime(log_clock::to_time_t(msg.time)); + m_LastLogSecs = secs; + } + } + + const auto& tm_time = m_CachedTm; + + // cache the date/time part for the next second. 
+ auto duration = msg.time - m_Epoch; + auto secs = duration_cast<seconds>(duration); + + if (m_CacheTimestamp != secs || m_CachedDatetime.size() == 0) + { + m_CachedDatetime.clear(); + m_CachedDatetime.push_back('['); + + if constexpr (UseDate) + { + fmt_helper::append_int(tm_time.tm_year + 1900, m_CachedDatetime); + m_CachedDatetime.push_back('-'); + + fmt_helper::pad2(tm_time.tm_mon + 1, m_CachedDatetime); + m_CachedDatetime.push_back('-'); + + fmt_helper::pad2(tm_time.tm_mday, m_CachedDatetime); + m_CachedDatetime.push_back(' '); + + fmt_helper::pad2(tm_time.tm_hour, m_CachedDatetime); + m_CachedDatetime.push_back(':'); + + fmt_helper::pad2(tm_time.tm_min, m_CachedDatetime); + m_CachedDatetime.push_back(':'); + + fmt_helper::pad2(tm_time.tm_sec, m_CachedDatetime); + } + else + { + int Count = int(secs.count()); + + const int LogSecs = Count % 60; + Count /= 60; + + const int LogMins = Count % 60; + Count /= 60; + + const int LogHours = Count; + + fmt_helper::pad2(LogHours, m_CachedDatetime); + m_CachedDatetime.push_back(':'); + fmt_helper::pad2(LogMins, m_CachedDatetime); + m_CachedDatetime.push_back(':'); + fmt_helper::pad2(LogSecs, m_CachedDatetime); + } + + m_CachedDatetime.push_back('.'); + + m_CacheTimestamp = secs; + } + + dest.append(m_CachedDatetime.begin(), m_CachedDatetime.end()); + + auto millis = fmt_helper::time_fraction<milliseconds>(msg.time); + fmt_helper::pad3(static_cast<uint32_t>(millis.count()), dest); + dest.push_back(']'); + dest.push_back(' '); + + if (!m_LogId.empty()) + { + dest.push_back('['); + fmt_helper::append_string_view(m_LogId, dest); + dest.push_back(']'); + dest.push_back(' '); + } + + // append logger name if exists + if (msg.logger_name.size() > 0) + { + dest.push_back('['); + fmt_helper::append_string_view(msg.logger_name, dest); + dest.push_back(']'); + dest.push_back(' '); + } + + dest.push_back('['); + // wrap the level name with color + msg.color_range_start = dest.size(); + 
fmt_helper::append_string_view(level::to_string_view(msg.level), dest); + msg.color_range_end = dest.size(); + dest.push_back(']'); + dest.push_back(' '); + + // add source location if present + if (!msg.source.empty()) + { + dest.push_back('['); + const char* filename = details::short_filename_formatter<details::null_scoped_padder>::basename(msg.source.filename); + fmt_helper::append_string_view(filename, dest); + dest.push_back(':'); + fmt_helper::append_int(msg.source.line, dest); + dest.push_back(']'); + dest.push_back(' '); + } + + fmt_helper::append_string_view(msg.payload, dest); + fmt_helper::append_string_view("\n"sv, dest); + } + +private: + std::chrono::time_point<std::chrono::system_clock> m_Epoch; + std::tm m_CachedTm; + std::chrono::seconds m_LastLogSecs; + std::chrono::seconds m_CacheTimestamp{0}; + memory_buf_t m_CachedDatetime; + std::string m_LogId; +}; + +} // namespace logging + +bool +EnableVTMode() +{ + // Set output mode to handle virtual terminal sequences + HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE); + if (hOut == INVALID_HANDLE_VALUE) + { + return false; + } + + DWORD dwMode = 0; + if (!GetConsoleMode(hOut, &dwMode)) + { + return false; + } + + dwMode |= ENABLE_VIRTUAL_TERMINAL_PROCESSING; + if (!SetConsoleMode(hOut, dwMode)) + { + return false; + } + + return true; +} + +void +InitializeLogging(const ZenServerOptions& GlobalOptions) +{ + EnableVTMode(); + + auto& sinks = spdlog::default_logger()->sinks(); + sinks.clear(); + sinks.push_back(std::make_shared<spdlog::sinks::ansicolor_stdout_sink_mt>()); + spdlog::set_level(spdlog::level::debug); + spdlog::set_formatter(std::make_unique<logging::full_formatter>(GlobalOptions.LogId, std::chrono::system_clock::now())); +} + +spdlog::logger& +ConsoleLog() +{ + static auto ConLogger = spdlog::stdout_color_mt("console"); + + ConLogger->set_pattern("%v"); + + return *ConLogger; +} diff --git a/zenserver/diag/logging.h b/zenserver/diag/logging.h new file mode 100644 index 000000000..1b1813913 --- 
/dev/null +++ b/zenserver/diag/logging.h @@ -0,0 +1,11 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <spdlog/spdlog.h> + +struct ZenServerOptions; + +void InitializeLogging(const ZenServerOptions& GlobalOptions); + +spdlog::logger& ConsoleLog(); diff --git a/zenserver/experimental/usnjournal.cpp b/zenserver/experimental/usnjournal.cpp new file mode 100644 index 000000000..f44e50945 --- /dev/null +++ b/zenserver/experimental/usnjournal.cpp @@ -0,0 +1,341 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "usnjournal.h" + +#include <zencore/except.h> +#include <zencore/timer.h> +#include <zencore/zencore.h> + +#include <spdlog/spdlog.h> + +#include <atlfile.h> + +#include <filesystem> + +namespace zen { + +UsnJournalReader::UsnJournalReader() +{ +} + +UsnJournalReader::~UsnJournalReader() +{ + delete[] m_JournalReadBuffer; +} + +bool +UsnJournalReader::Initialize(std::filesystem::path VolumePath) +{ + TCHAR VolumeName[MAX_PATH]; + TCHAR VolumePathName[MAX_PATH]; + + { + auto NativePath = VolumePath.native(); + BOOL Success = GetVolumePathName(NativePath.c_str(), VolumePathName, ZEN_ARRAY_COUNT(VolumePathName)); + + if (!Success) + { + zen::ThrowSystemException("GetVolumePathName failed"); + } + + Success = GetVolumeNameForVolumeMountPoint(VolumePathName, VolumeName, ZEN_ARRAY_COUNT(VolumeName)); + + if (!Success) + { + zen::ThrowSystemException("GetVolumeNameForVolumeMountPoint failed"); + } + + // Chop off trailing slash since we want to open a volume handle, not a handle to the volume root directory + + const size_t VolumeNameLength = wcslen(VolumeName); + + if (VolumeNameLength) + { + VolumeName[VolumeNameLength - 1] = '\0'; + } + } + + m_VolumeHandle = CreateFile(VolumeName, + GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + nullptr, /* no custom security */ + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, + nullptr); /* template */ + + if (m_VolumeHandle == 
INVALID_HANDLE_VALUE) + { + ThrowSystemException("Volume handle open failed"); + } + + // Figure out which file system is in use for volume + + { + WCHAR InfoVolumeName[MAX_PATH + 1]{}; + WCHAR FileSystemName[MAX_PATH + 1]{}; + DWORD MaximumComponentLength = 0; + DWORD FileSystemFlags = 0; + + BOOL Success = GetVolumeInformationByHandleW(m_VolumeHandle, + InfoVolumeName, + MAX_PATH + 1, + NULL, + &MaximumComponentLength, + &FileSystemFlags, + FileSystemName, + ZEN_ARRAY_COUNT(FileSystemName)); + + if (!Success) + { + ThrowSystemException("Failed to get volume information"); + } + + spdlog::debug("File system type is {}", WideToUtf8(FileSystemName)); + + if (wcscmp(L"ReFS", FileSystemName) == 0) + { + m_FileSystemType = FileSystemType::ReFS; + } + else if (wcscmp(L"NTFS", FileSystemName) == 0) + { + m_FileSystemType = FileSystemType::NTFS; + } + else + { + // Unknown file system type! + } + } + + // Determine if volume is on fast storage, where seeks aren't so expensive + + { + STORAGE_PROPERTY_QUERY StorageQuery{}; + StorageQuery.PropertyId = StorageDeviceSeekPenaltyProperty; + StorageQuery.QueryType = PropertyStandardQuery; + DWORD BytesWritten; + DEVICE_SEEK_PENALTY_DESCRIPTOR Result{}; + + if (DeviceIoControl(m_VolumeHandle, + IOCTL_STORAGE_QUERY_PROPERTY, + &StorageQuery, + sizeof(StorageQuery), + &Result, + sizeof(Result), + &BytesWritten, + nullptr)) + { + m_IncursSeekPenalty = !!Result.IncursSeekPenalty; + } + } + + // Query Journal + + USN_JOURNAL_DATA_V2 UsnData{}; + + { + DWORD BytesWritten = 0; + + const BOOL Success = + DeviceIoControl(m_VolumeHandle, FSCTL_QUERY_USN_JOURNAL, nullptr, 0, &UsnData, sizeof UsnData, &BytesWritten, nullptr); + + if (!Success) + { + switch (DWORD Error = GetLastError()) + { + case ERROR_JOURNAL_NOT_ACTIVE: + spdlog::info("No USN journal active on drive"); + + // TODO: optionally activate USN journal on drive? 
+ + ThrowSystemException(HRESULT_FROM_WIN32(Error), "No USN journal active on drive"); + break; + + default: + ThrowSystemException(HRESULT_FROM_WIN32(Error), "FSCTL_QUERY_USN_JOURNAL failed"); + } + } + } + + m_JournalReadBuffer = new uint8_t[m_ReadBufferSize]; + + // Catch up to USN start + + CAtlFile VolumeRootDir; + HRESULT hRes = + VolumeRootDir.Create(VolumePathName, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS); + + ThrowIfFailed(hRes, "Failed to open handle to volume root"); + + FILE_ID_INFO FileInformation{}; + BOOL Success = GetFileInformationByHandleEx(VolumeRootDir, FileIdInfo, &FileInformation, sizeof FileInformation); + + if (!Success) + { + ThrowSystemException("GetFileInformationByHandleEx failed"); + } + + const Frn VolumeRootFrn = FileInformation.FileId; + + // Enumerate MFT (but not for ReFS) + + if (m_FileSystemType == FileSystemType::NTFS) + { + spdlog::info("Enumerating MFT for {}", WideToUtf8(VolumePathName)); + + zen::Stopwatch Timer; + uint64_t MftBytesProcessed = 0; + + MFT_ENUM_DATA_V1 MftEnumData{.StartFileReferenceNumber = 0, .LowUsn = 0, .HighUsn = 0, .MinMajorVersion = 2, .MaxMajorVersion = 3}; + + BYTE MftBuffer[64 * 1024 + sizeof(DWORDLONG)]; + DWORD BytesWritten = 0; + + for (;;) + { + Success = DeviceIoControl(m_VolumeHandle, + FSCTL_ENUM_USN_DATA, + &MftEnumData, + sizeof MftEnumData, + MftBuffer, + sizeof MftBuffer, + &BytesWritten, + nullptr); + + if (!Success) + { + DWORD Error = GetLastError(); + + if (Error == ERROR_HANDLE_EOF) + { + break; + } + + ThrowSystemException(HRESULT_FROM_WIN32(Error), "FSCTL_ENUM_USN_DATA failed"); + } + + void* BufferEnd = (void*)&MftBuffer[BytesWritten]; + + // The enumeration call returns the next FRN ahead of the other data in the buffer + MftEnumData.StartFileReferenceNumber = ((DWORDLONG*)MftBuffer)[0]; + + PUSN_RECORD_UNION CommonRecord = PUSN_RECORD_UNION(&((DWORDLONG*)MftBuffer)[1]); + + while (CommonRecord < BufferEnd) + { + switch 
(CommonRecord->Header.MajorVersion) + { + case 2: + { + USN_RECORD_V2& Record = CommonRecord->V2; + + const Frn FileReference = Record.FileReferenceNumber; + const Frn ParentReference = Record.ParentFileReferenceNumber; + std::wstring_view FileName{Record.FileName, Record.FileNameLength}; + } + break; + case 3: + { + USN_RECORD_V3& Record = CommonRecord->V3; + + const Frn FileReference = Record.FileReferenceNumber; + const Frn ParentReference = Record.ParentFileReferenceNumber; + std::wstring_view FileName{Record.FileName, Record.FileNameLength}; + } + break; + case 4: + { + // This captures file modification ranges. We do not yet support this however + USN_RECORD_V4& Record = CommonRecord->V4; + } + break; + } + + const DWORD RecordLength = CommonRecord->Header.RecordLength; + CommonRecord = PUSN_RECORD_UNION(((uint8_t*)CommonRecord) + RecordLength); + MftBytesProcessed += RecordLength; + } + } + + const auto ElapsedMs = Timer.getElapsedTimeMs(); + + spdlog::info("MFT enumeration of {} completed after {} ({})", + zen::NiceBytes(MftBytesProcessed), + zen::NiceTimeSpanMs(ElapsedMs), + zen::NiceByteRate(MftBytesProcessed, ElapsedMs)); + } + + // Populate by traversal + if (m_FileSystemType == FileSystemType::ReFS) + { + uint64_t FileInfoBuffer[8 * 1024]; + + FILE_INFO_BY_HANDLE_CLASS FibClass = FileIdBothDirectoryRestartInfo; + bool Continue = true; + + while (Continue) + { + Success = GetFileInformationByHandleEx(VolumeRootDir, FibClass, FileInfoBuffer, sizeof FileInfoBuffer); + FibClass = FileIdBothDirectoryInfo; // Set up for next iteration + + uint64_t EntryOffset = 0; + + if (!Success) + { + // Report failure? 
+ + break; + } + + do + { + const FILE_ID_BOTH_DIR_INFO* DirInfo = + reinterpret_cast<const FILE_ID_BOTH_DIR_INFO*>(reinterpret_cast<const uint8_t*>(FileInfoBuffer) + EntryOffset); + + const uint64_t NextOffset = DirInfo->NextEntryOffset; + + if (NextOffset == 0) + { + if (EntryOffset == 0) + { + // First and last - end of iteration + Continue = false; + } + break; + } + + if (DirInfo->FileAttributes & FILE_ATTRIBUTE_DIRECTORY) + { + // TODO Directory + } + else if (DirInfo->FileAttributes & FILE_ATTRIBUTE_DEVICE) + { + // TODO Device + } + else + { + // TODO File + } + + EntryOffset += DirInfo->NextEntryOffset; + } while (EntryOffset); + } + } + + // Initialize journal reading + + m_ReadUsnJournalData = {.StartUsn = UsnData.FirstUsn, + .ReasonMask = USN_REASON_BASIC_INFO_CHANGE | USN_REASON_CLOSE | USN_REASON_DATA_EXTEND | + USN_REASON_DATA_OVERWRITE | USN_REASON_DATA_TRUNCATION | USN_REASON_FILE_CREATE | + USN_REASON_FILE_DELETE | USN_REASON_HARD_LINK_CHANGE | USN_REASON_RENAME_NEW_NAME | + USN_REASON_RENAME_OLD_NAME | USN_REASON_REPARSE_POINT_CHANGE, + .ReturnOnlyOnClose = true, + .Timeout = 0, + .BytesToWaitFor = 0, + .UsnJournalID = UsnData.UsnJournalID, + .MinMajorVersion = 0, + .MaxMajorVersion = 0}; + + return false; +} + +} // namespace zen diff --git a/zenserver/experimental/usnjournal.h b/zenserver/experimental/usnjournal.h new file mode 100644 index 000000000..9c1008d52 --- /dev/null +++ b/zenserver/experimental/usnjournal.h @@ -0,0 +1,62 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <zencore/windows.h> + +#include <winioctl.h> + +#include <filesystem> + +namespace zen { + +class UsnJournalReader +{ +public: + UsnJournalReader(); + ~UsnJournalReader(); + + bool Initialize(std::filesystem::path VolumePath); + +private: + void* m_VolumeHandle; + READ_USN_JOURNAL_DATA_V1 m_ReadUsnJournalData; + bool m_IncursSeekPenalty = true; + + uint8_t* m_JournalReadBuffer = nullptr; + uint64_t m_ReadBufferSize = 64 * 1024; + + struct Frn + { + uint8_t IdBytes[16]; + + Frn() = default; + + Frn(const FILE_ID_128& Rhs) { memcpy(IdBytes, Rhs.Identifier, sizeof IdBytes); } + Frn& operator=(const FILE_ID_128& Rhs) { memcpy(IdBytes, Rhs.Identifier, sizeof IdBytes); } + + Frn(const uint64_t& Rhs) + { + memcpy(IdBytes, &Rhs, sizeof Rhs); + memset(&IdBytes[8], 0, 8); + } + + Frn& operator=(const uint64_t& Rhs) + { + memcpy(IdBytes, &Rhs, sizeof Rhs); + memset(&IdBytes[8], 0, 8); + } + + std::strong_ordering operator<=>(const Frn&) const = default; + }; + + enum class FileSystemType + { + ReFS, + NTFS + }; + + FileSystemType m_FileSystemType = FileSystemType::NTFS; +}; + +} // namespace zen diff --git a/zenserver/experimental/vfs.cpp b/zenserver/experimental/vfs.cpp new file mode 100644 index 000000000..1af9d70a7 --- /dev/null +++ b/zenserver/experimental/vfs.cpp @@ -0,0 +1,3 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "vfs.h" diff --git a/zenserver/experimental/vfs.h b/zenserver/experimental/vfs.h new file mode 100644 index 000000000..1aeefe481 --- /dev/null +++ b/zenserver/experimental/vfs.h @@ -0,0 +1,5 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/zencore.h> diff --git a/zenserver/projectstore.cpp b/zenserver/projectstore.cpp new file mode 100644 index 000000000..0dc0da1ae --- /dev/null +++ b/zenserver/projectstore.cpp @@ -0,0 +1,1547 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "projectstore.h" + +#include <zencore/compactbinarybuilder.h> +#include <zencore/compactbinarypackage.h> +#include <zencore/compactbinaryvalidation.h> +#include <zencore/filesystem.h> +#include <zencore/fmtutils.h> +#include <zencore/stream.h> +#include <zencore/string.h> +#include <zencore/timer.h> +#include <zencore/windows.h> +#include <zenstore/cas.h> +#include <zenstore/caslog.h> + +#pragma comment(lib, "Rpcrt4.lib") // RocksDB made me do this +#include <rocksdb/db.h> + +#include <lmdb.h> +#include <ppl.h> +#include <spdlog/spdlog.h> +#include <xxh3.h> +#include <asio.hpp> +#include <future> +#include <latch> + +namespace zen { + +namespace rocksdb = ROCKSDB_NAMESPACE; +using namespace fmt::literals; + +////////////////////////////////////////////////////////////////////////// + +struct ProjectStore::OplogStorage : public RefCounted +{ + OplogStorage(ProjectStore::Oplog* OwnerOplog, std::filesystem::path BasePath) : m_OwnerOplog(OwnerOplog), m_OplogStoragePath(BasePath) + { + } + + ~OplogStorage() + { + Log().info("closing oplog storage at {}", m_OplogStoragePath); + Flush(); + + if (m_LmdbEnv) + { + mdb_env_close(m_LmdbEnv); + m_LmdbEnv = nullptr; + } + + if (m_RocksDb) + { + // Column families must be torn down before database is closed + for (const auto& Handle : m_RocksDbColumnHandles) + { + m_RocksDb->DestroyColumnFamilyHandle(Handle); + } + + rocksdb::Status Status = m_RocksDb->Close(); + + if (!Status.ok()) + { + Log().warn("db close error reported for '{}' : '{}'", m_OplogStoragePath, Status.getState()); + } + } + } + + [[nodiscard]] bool Exists() { return Exists(m_OplogStoragePath); } + [[nodiscard]] static bool Exists(std::filesystem::path BasePath) + { + return std::filesystem::exists(BasePath / "ops.zlog") && std::filesystem::exists(BasePath / "ops.zdb") && + std::filesystem::exists(BasePath / "ops.zops"); + } + + static bool Delete(std::filesystem::path BasePath) { return DeleteDirectories(BasePath); } + + void Open(bool IsCreate) + { 
+ Log().info("initializing oplog storage at '{}'", m_OplogStoragePath); + + if (IsCreate) + { + DeleteDirectories(m_OplogStoragePath); + CreateDirectories(m_OplogStoragePath); + } + + m_Oplog.Open(m_OplogStoragePath / "ops.zlog", IsCreate); + m_Oplog.Initialize(); + + m_OpBlobs.Open(m_OplogStoragePath / "ops.zops", IsCreate); + + ZEN_ASSERT(IsPow2(m_OpsAlign)); + ZEN_ASSERT(!(m_NextOpsOffset & (m_OpsAlign - 1))); + + { + std::string LmdbPath = WideToUtf8((m_OplogStoragePath / "ops.zdb").native().c_str()); + + int rc = mdb_env_create(&m_LmdbEnv); + rc = mdb_env_set_mapsize(m_LmdbEnv, 8 * 1024 * 1024); + rc = mdb_env_set_maxreaders(m_LmdbEnv, 256); + rc = mdb_env_open(m_LmdbEnv, LmdbPath.c_str(), MDB_NOSUBDIR | MDB_WRITEMAP | MDB_NOMETASYNC | MDB_NOSYNC, 0666); + } + + { + std::string RocksdbPath = WideToUtf8((m_OplogStoragePath / "ops.rdb").native().c_str()); + + Log().debug("opening rocksdb db at '{}'", RocksdbPath); + + rocksdb::DB* Db; + rocksdb::DBOptions Options; + Options.create_if_missing = true; + + std::vector<std::string> ExistingColumnFamilies; + rocksdb::Status Status = rocksdb::DB::ListColumnFamilies(Options, RocksdbPath, &ExistingColumnFamilies); + + std::vector<rocksdb::ColumnFamilyDescriptor> ColumnDescriptors; + + if (Status.IsPathNotFound()) + { + ColumnDescriptors.emplace_back(rocksdb::ColumnFamilyDescriptor{rocksdb::kDefaultColumnFamilyName, {}}); + } + else if (Status.ok()) + { + for (const std::string& Column : ExistingColumnFamilies) + { + rocksdb::ColumnFamilyDescriptor ColumnFamily; + ColumnFamily.name = Column; + ColumnDescriptors.push_back(ColumnFamily); + } + } + else + { + throw std::exception("column family iteration failed for '{}': '{}'"_format(RocksdbPath, Status.getState()).c_str()); + } + + Status = rocksdb::DB::Open(Options, RocksdbPath, ColumnDescriptors, &m_RocksDbColumnHandles, &Db); + + if (!Status.ok()) + { + throw std::exception("database open failed for '{}': '{}'"_format(RocksdbPath, Status.getState()).c_str()); + } + + 
m_RocksDb.reset(Db); + } + } + + void ReplayLog(std::function<void(CbObject, const OplogEntry&)>&& Handler) + { + // This could use memory mapping or do something clever but for now it just reads the file sequentially + + spdlog::info("replaying log for '{}'", m_OplogStoragePath); + + Stopwatch Timer; + + m_Oplog.Replay([&](const zen::OplogEntry& LogEntry) { + IoBuffer OpBuffer(LogEntry.OpCoreSize); + + const uint64_t OpFileOffset = LogEntry.OpCoreOffset * m_OpsAlign; + + m_OpBlobs.Read((void*)OpBuffer.Data(), LogEntry.OpCoreSize, OpFileOffset); + + // Verify checksum, ignore op data if incorrect + const auto OpCoreHash = uint32_t(XXH3_64bits(OpBuffer.Data(), OpBuffer.Size()) & 0xffffFFFF); + + if (OpCoreHash != LogEntry.OpCoreHash) + { + Log().warn("skipping oplog entry with bad checksum!"); + return; + } + + CbObject Op(SharedBuffer::MakeView(OpBuffer.Data(), OpBuffer.Size())); + + m_NextOpsOffset = + Max(m_NextOpsOffset.load(std::memory_order::memory_order_relaxed), RoundUp(OpFileOffset + LogEntry.OpCoreSize, m_OpsAlign)); + m_MaxLsn = Max(m_MaxLsn.load(std::memory_order::memory_order_relaxed), LogEntry.OpLsn); + + Handler(Op, LogEntry); + }); + + spdlog::info("Oplog replay completed in {} - Max LSN# {}, Next offset: {}", + NiceTimeSpanMs(Timer.getElapsedTimeMs()), + m_MaxLsn, + m_NextOpsOffset); + } + + OplogEntry AppendOp(CbObject Op) + { + SharedBuffer Buffer = Op.GetBuffer(); + const uint64_t WriteSize = Buffer.GetSize(); + const auto OpCoreHash = uint32_t(XXH3_64bits(Buffer.GetData(), WriteSize) & 0xffffFFFF); + + XXH3_128Stream KeyHasher; + Op["key"].WriteToStream([&](const void* Data, size_t Size) { KeyHasher.Append(Data, Size); }); + XXH3_128 KeyHash = KeyHasher.GetHash(); + + RwLock::ExclusiveLockScope _(m_RwLock); + const uint64_t WriteOffset = m_NextOpsOffset; + const uint32_t OpLsn = ++m_MaxLsn; + + m_NextOpsOffset = RoundUp(WriteOffset + WriteSize, m_OpsAlign); + + ZEN_ASSERT(IsMultipleOf(WriteOffset, m_OpsAlign)); + + OplogEntry Entry = {.OpLsn = 
OpLsn, + .OpCoreOffset = gsl::narrow_cast<uint32_t>(WriteOffset / m_OpsAlign), + .OpCoreSize = uint32_t(Buffer.GetSize()), + .OpCoreHash = OpCoreHash, + .OpKeyHash = KeyHash}; + + m_Oplog.Append(Entry); + + m_OpBlobs.Write(Buffer.GetData(), WriteSize, WriteOffset); + + return Entry; + } + + void Flush() + { + m_Oplog.Flush(); + m_OpBlobs.Flush(); + } + + spdlog::logger& Log() { return m_OwnerOplog->Log(); } + +private: + ProjectStore::Oplog* m_OwnerOplog; + std::filesystem::path m_OplogStoragePath; + RwLock m_RwLock; + TCasLogFile<OplogEntry> m_Oplog; + CasBlobFile m_OpBlobs; + std::atomic<uint64_t> m_NextOpsOffset{0}; + uint64_t m_OpsAlign = 32; + std::atomic<uint32_t> m_MaxLsn{0}; + MDB_env* m_LmdbEnv = nullptr; + std::unique_ptr<rocksdb::DB> m_RocksDb; + std::vector<rocksdb::ColumnFamilyHandle*> m_RocksDbColumnHandles; +}; + +////////////////////////////////////////////////////////////////////////// + +ProjectStore::Oplog::Oplog(std::string_view Id, Project* Outer, CasStore& Store, std::filesystem::path BasePath) +: m_OuterProject(Outer) +, m_CasStore(Store) +, m_OplogId(Id) +, m_BasePath(BasePath) +{ + m_Storage = new OplogStorage(this, m_BasePath); + const bool StoreExists = m_Storage->Exists(); + m_Storage->Open(/* IsCreate */ !StoreExists); + + m_TempPath = m_BasePath / "temp"; + + zen::CleanDirectory(m_TempPath); +} + +ProjectStore::Oplog::~Oplog() = default; + +bool +ProjectStore::Oplog::ExistsAt(std::filesystem::path BasePath) +{ + return OplogStorage::Exists(BasePath); +} + +void +ProjectStore::Oplog::ReplayLog() +{ + m_Storage->ReplayLog([&](CbObject Op, const OplogEntry& OpEntry) { RegisterOplogEntry(Op, OpEntry, kUpdateReplay); }); +} + +IoBuffer +ProjectStore::Oplog::FindChunk(Oid ChunkId) +{ + if (auto ChunkIt = m_ChunkMap.find(ChunkId); ChunkIt != m_ChunkMap.end()) + { + return m_CasStore.FindChunk(ChunkIt->second); + } + + if (auto FileIt = m_ServerFileMap.find(ChunkId); FileIt != m_ServerFileMap.end()) + { + std::filesystem::path FilePath = 
m_OuterProject->RootDir / FileIt->second; + + return IoBufferBuilder::MakeFromFile(FilePath.native().c_str()); + } + + if (auto MetaIt = m_MetaMap.find(ChunkId); MetaIt != m_MetaMap.end()) + { + return m_CasStore.FindChunk(MetaIt->second); + } + + return {}; +} + +void +ProjectStore::Oplog::IterateFileMap(std::function<void(const Oid&, const std::string_view&)>&& Fn) +{ + for (const auto& Kv : m_FileMap) + { + Fn(Kv.first, Kv.second); + } +} + +void +ProjectStore::Oplog::AddFileMapping(Oid FileId, std::string_view Path) +{ + m_FileMap.emplace(FileId, Path); +} + +void +ProjectStore::Oplog::AddServerFileMapping(Oid FileId, std::string_view Path) +{ + m_ServerFileMap.emplace(FileId, Path); +} + +void +ProjectStore::Oplog::AddChunkMapping(Oid ChunkId, IoHash Hash) +{ + m_ChunkMap.emplace(ChunkId, Hash); +} + +void +ProjectStore::Oplog::AddMetaMapping(Oid ChunkId, IoHash Hash) +{ + m_MetaMap.emplace(ChunkId, Hash); +} + +uint32_t +ProjectStore::Oplog::RegisterOplogEntry(CbObject Core, const OplogEntry& OpEntry, UpdateType TypeOfUpdate) +{ + ZEN_UNUSED(TypeOfUpdate); + + using namespace std::literals; + + // Update chunk id maps + + if (Core["package"sv]) + { + CbObjectView PkgObj = Core["package"sv].AsObjectView(); + Oid PackageId = PkgObj["id"sv].AsObjectId(); + IoHash PackageHash = PkgObj["data"sv].AsBinaryAttachment(); + + AddChunkMapping(PackageId, PackageHash); + + Log().debug("package data {} -> {}", PackageId, PackageHash); + } + + for (CbFieldView& Entry : Core["bulkdata"sv]) + { + CbObjectView BulkObj = Entry.AsObjectView(); + + Oid BulkDataId = BulkObj["id"sv].AsObjectId(); + IoHash BulkDataHash = BulkObj["data"sv].AsBinaryAttachment(); + + AddChunkMapping(BulkDataId, BulkDataHash); + + Log().debug("bulkdata {} -> {}", BulkDataId, BulkDataHash); + } + + if (CbFieldView FilesArray = Core["files"sv]) + { + int FileCount = 0; + int ServerFileCount = 0; + + std::atomic<bool> InvalidOp{false}; + + Stopwatch Timer; + + std::future<void> f0 = 
std::async(std::launch::async, [&] { + for (CbFieldView& Entry : FilesArray) + { + CbObjectView FileObj = Entry.AsObjectView(); + const Oid FileId = FileObj["id"sv].AsObjectId(); + + if (auto PathField = FileObj["path"sv]) + { + AddFileMapping(FileId, PathField.AsString()); + + // Log().debug("file {} -> {}", FileId, PathString); + + ++FileCount; + } + else + { + // Every file entry needs to specify a path + InvalidOp = true; + break; + } + + if (InvalidOp.load(std::memory_order::relaxed)) + { + break; + } + } + }); + + std::future<void> f1 = std::async(std::launch::async, [&] { + CbArrayView ServerFilesArray = Core["serverfiles"sv].AsArrayView(); + + for (CbFieldView& Entry : ServerFilesArray) + { + CbObjectView FileObj = Entry.AsObjectView(); + const Oid FileId = FileObj["id"sv].AsObjectId(); + + if (auto PathField = FileObj["path"sv]) + { + AddServerFileMapping(FileId, PathField.AsString()); + + // m_log.debug("file {} -> {}", FileId, PathString); + + ++ServerFileCount; + } + else + { + // Every file entry needs to specify a path + InvalidOp = true; + break; + } + + if (InvalidOp.load(std::memory_order::relaxed)) + { + break; + } + } + }); + + f0.wait(); + f1.wait(); + + if (InvalidOp) + { + return kInvalidOp; + } + + if (FileCount || ServerFileCount) + { + Log().debug("{} files registered, {} server files (took {})", + FileCount, + ServerFileCount, + NiceTimeSpanMs(Timer.getElapsedTimeMs())); + + if (FileCount != ServerFileCount) + { + Log().warn("client/server file list mismatch: {} vs {}", FileCount, ServerFileCount); + } + } + } + + for (CbFieldView& Entry : Core["meta"sv]) + { + CbObjectView MetaObj = Entry.AsObjectView(); + const Oid MetaId = MetaObj["id"sv].AsObjectId(); + auto NameString = MetaObj["name"sv].AsString(); + IoHash MetaDataHash = MetaObj["data"sv].AsBinaryAttachment(); + + AddMetaMapping(MetaId, MetaDataHash); + + Log().debug("meta data ({}) {} -> {}", NameString, MetaId, MetaDataHash); + } + + m_OpAddressMap.emplace(OpEntry.OpLsn, 
OplogEntryAddress{.Offset = OpEntry.OpCoreOffset, .Size = OpEntry.OpCoreSize}); + m_LatestOpMap[OpEntry.OpKeyAsOId()] = OpEntry.OpLsn; + + return OpEntry.OpLsn; +} + +uint32_t +ProjectStore::Oplog::AppendNewOplogEntry(CbPackage OpPackage) +{ + using namespace std::literals; + + const CbObject& Core = OpPackage.GetObject(); + const OplogEntry OpEntry = m_Storage->AppendOp(Core); + + // Persist attachments + + auto Attachments = OpPackage.GetAttachments(); + + for (const auto& Attach : Attachments) + { + SharedBuffer BinaryView = Attach.AsBinaryView(); + m_CasStore.InsertChunk(BinaryView.GetData(), BinaryView.GetSize(), Attach.GetHash()); + } + + return RegisterOplogEntry(Core, OpEntry, kUpdateNewEntry); +} + +////////////////////////////////////////////////////////////////////////// + +ProjectStore::Project::Project(ProjectStore* PrjStore, CasStore& Store, std::filesystem::path BasePath) +: m_ProjectStore(PrjStore) +, m_CasStore(Store) +, m_OplogStoragePath(BasePath) +{ +} + +ProjectStore::Project::~Project() +{ +} + +bool +ProjectStore::Project::Exists(std::filesystem::path BasePath) +{ + return std::filesystem::exists(BasePath / "Project.zcb"); +} + +void +ProjectStore::Project::Read() +{ + std::filesystem::path ProjectStateFilePath = m_OplogStoragePath / "Project.zcb"; + + spdlog::info("reading config for project '{}' from {}", Identifier, ProjectStateFilePath); + + CasBlobFile Blob; + Blob.Open(ProjectStateFilePath, false); + + IoBuffer Obj = Blob.ReadAll(); + CbValidateError ValidationError = ValidateCompactBinary(MemoryView(Obj.Data(), Obj.Size()), CbValidateMode::All); + + if (ValidationError == CbValidateError::None) + { + CbObject Cfg = LoadCompactBinaryObject(Obj); + + Identifier = Cfg["id"].AsString(); + RootDir = Cfg["root"].AsString(); + ProjectRootDir = Cfg["project"].AsString(); + EngineRootDir = Cfg["engine"].AsString(); + } + else + { + spdlog::error("validation error {} hit for '{}'", int(ValidationError), ProjectStateFilePath); + } +} + +void 
+ProjectStore::Project::Write() +{ + MemoryOutStream Mem; + BinaryWriter Writer(Mem); + + CbObjectWriter Cfg; + Cfg << "id" << Identifier; + Cfg << "root" << WideToUtf8(RootDir.c_str()); + Cfg << "project" << ProjectRootDir; + Cfg << "engine" << EngineRootDir; + + Cfg.Save(Writer); + + CreateDirectories(m_OplogStoragePath); + + std::filesystem::path ProjectStateFilePath = m_OplogStoragePath / "Project.zcb"; + + spdlog::info("persisting config for project '{}' to {}", Identifier, ProjectStateFilePath); + + CasBlobFile Blob; + Blob.Open(ProjectStateFilePath, true); + Blob.Write(Mem.Data(), Mem.Size(), 0); + Blob.Flush(); +} + +spdlog::logger& +ProjectStore::Project::Log() +{ + return m_ProjectStore->Log(); +} + +std::filesystem::path +ProjectStore::Project::BasePathForOplog(std::string_view OplogId) +{ + return m_OplogStoragePath / OplogId; +} + +ProjectStore::Oplog* +ProjectStore::Project::NewOplog(std::string_view OplogId) +{ + RwLock::ExclusiveLockScope _(m_ProjectLock); + + std::filesystem::path OplogBasePath = BasePathForOplog(OplogId); + + try + { + Oplog& Log = m_Oplogs.try_emplace(std::string{OplogId}, OplogId, this, m_CasStore, OplogBasePath).first->second; + + return &Log; + } + catch (std::exception&) + { + // In case of failure we need to ensure there's no half constructed entry around + // + // (This is probably already ensured by the try_emplace implementation?) 
+ + m_Oplogs.erase(std::string{OplogId}); + + return nullptr; + } +} + +ProjectStore::Oplog* +ProjectStore::Project::OpenOplog(std::string_view OplogId) +{ + { + RwLock::SharedLockScope _(m_ProjectLock); + + auto OplogIt = m_Oplogs.find(std::string(OplogId)); + + if (OplogIt != m_Oplogs.end()) + { + return &OplogIt->second; + } + } + + RwLock::ExclusiveLockScope _(m_ProjectLock); + + std::filesystem::path OplogBasePath = BasePathForOplog(OplogId); + + if (Oplog::ExistsAt(OplogBasePath)) + { + // Do open of existing oplog + + try + { + Oplog& Log = m_Oplogs.try_emplace(std::string{OplogId}, OplogId, this, m_CasStore, OplogBasePath).first->second; + + Log.ReplayLog(); + + return &Log; + } + catch (std::exception& ex) + { + spdlog::error("failed to open oplog '{}' @ '{}': {}", OplogId, OplogBasePath, ex.what()); + + m_Oplogs.erase(std::string{OplogId}); + } + } + + return nullptr; +} + +void +ProjectStore::Project::DeleteOplog(std::string_view OplogId) +{ + bool Exists = false; + + { + RwLock::ExclusiveLockScope _(m_ProjectLock); + + auto OplogIt = m_Oplogs.find(std::string(OplogId)); + + if (OplogIt != m_Oplogs.end()) + { + Exists = true; + + m_Oplogs.erase(OplogIt); + } + } + + // Actually erase + + std::filesystem::path OplogBasePath = BasePathForOplog(OplogId); + + OplogStorage::Delete(OplogBasePath); +} + +void +ProjectStore::Project::IterateOplogs(std::function<void(const Oplog&)>&& Fn) const +{ + // TODO: should iterate over oplogs which are present on disk but not yet loaded + + RwLock::SharedLockScope _(m_ProjectLock); + + for (auto& Kv : m_Oplogs) + { + Fn(Kv.second); + } +} + +////////////////////////////////////////////////////////////////////////// + +ProjectStore::ProjectStore(CasStore& Store, std::filesystem::path BasePath) +: m_Log("project", begin(spdlog::default_logger()->sinks()), end(spdlog::default_logger()->sinks())) +, m_ProjectBasePath(BasePath) +, m_CasStore(Store) +{ + m_Log.info("initializing project store at '{}'", BasePath); + 
m_Log.set_level(spdlog::level::debug); +} + +ProjectStore::~ProjectStore() +{ + m_Log.info("closing project store ('{}')", m_ProjectBasePath); +} + +std::filesystem::path +ProjectStore::BasePathForProject(std::string_view ProjectId) +{ + return m_ProjectBasePath / ProjectId; +} + +ProjectStore::Project* +ProjectStore::OpenProject(std::string_view ProjectId) +{ + { + RwLock::SharedLockScope _(m_ProjectsLock); + + auto ProjIt = m_Projects.find(std::string{ProjectId}); + + if (ProjIt != m_Projects.end()) + { + return &(ProjIt->second); + } + } + + RwLock::ExclusiveLockScope _(m_ProjectsLock); + + std::filesystem::path ProjectBasePath = BasePathForProject(ProjectId); + + if (Project::Exists(ProjectBasePath)) + { + try + { + Log().info("opening project {} @ {}", ProjectId, ProjectBasePath); + + ProjectStore::Project& Prj = m_Projects.try_emplace(std::string{ProjectId}, this, m_CasStore, ProjectBasePath).first->second; + Prj.Read(); + return &Prj; + } + catch (std::exception& e) + { + Log().warn("failed to open {} @ {} ({})", ProjectId, ProjectBasePath, e.what()); + m_Projects.erase(std::string{ProjectId}); + } + } + + return nullptr; +} + +ProjectStore::Project* +ProjectStore::NewProject(std::filesystem::path BasePath, + std::string_view ProjectId, + std::string_view RootDir, + std::string_view EngineRootDir, + std::string_view ProjectRootDir) +{ + RwLock::ExclusiveLockScope _(m_ProjectsLock); + + ProjectStore::Project& Prj = m_Projects.try_emplace(std::string{ProjectId}, this, m_CasStore, BasePath).first->second; + Prj.Identifier = ProjectId; + Prj.RootDir = RootDir; + Prj.EngineRootDir = EngineRootDir; + Prj.ProjectRootDir = ProjectRootDir; + Prj.Write(); + + return &Prj; +} + +void +ProjectStore::DeleteProject(std::string_view ProjectId) +{ + std::filesystem::path ProjectBasePath = BasePathForProject(ProjectId); + + Log().info("deleting project {} @ {}", ProjectId, ProjectBasePath); + + m_Projects.erase(std::string{ProjectId}); + + DeleteDirectories(ProjectBasePath); 
+} + +bool +ProjectStore::Exists(std::string_view ProjectId) +{ + return Project::Exists(BasePathForProject(ProjectId)); +} + +ProjectStore::Oplog* +ProjectStore::OpenProjectOplog(std::string_view ProjectId, std::string_view OplogId) +{ + if (Project* ProjectIt = OpenProject(ProjectId)) + { + return ProjectIt->OpenOplog(OplogId); + } + + return nullptr; +} + +////////////////////////////////////////////////////////////////////////// + +HttpProjectService::HttpProjectService(CasStore& Store, ProjectStore* Projects) +: m_CasStore(Store) +, m_Log("project", begin(spdlog::default_logger()->sinks()), end(spdlog::default_logger()->sinks())) +, m_ProjectStore(Projects) +{ + using namespace std::literals; + + m_Router.AddPattern("project", "([[:alnum:]_.]+)"); + m_Router.AddPattern("log", "([[:alnum:]_.]+)"); + m_Router.AddPattern("op", "([[:digit:]]+?)"); + m_Router.AddPattern("chunk", "([[:xdigit:]]{24})"); + + m_Router.RegisterRoute( + "{project}/oplog/{log}/batch", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + const auto& ProjectId = Req.GetCapture(1); + const auto& OplogId = Req.GetCapture(2); + + m_Log.info("batch - {} / {}", ProjectId, OplogId); + + ProjectStore::Oplog* FoundLog = m_ProjectStore->OpenProjectOplog(ProjectId, OplogId); + + if (FoundLog == nullptr) + { + return HttpReq.WriteResponse(HttpResponse::NotFound); + } + + ProjectStore::Oplog& Log = *FoundLog; + + // Parse Request + + IoBuffer Payload = HttpReq.ReadPayload(); + MemoryInStream MemIn(Payload.Data(), Payload.Size()); + BinaryReader Reader(MemIn); + + struct RequestHeader + { + enum + { + kMagic = 0xAAAA'77AC + }; + uint32_t Magic; + uint32_t ChunkCount; + uint32_t Reserved1; + uint32_t Reserved2; + }; + + struct RequestChunkEntry + { + Oid ChunkId; + uint32_t CorrelationId; + uint64_t Offset; + uint64_t RequestBytes; + }; + + if (Payload.Size() <= sizeof(RequestHeader)) + { + // Must stop here: the payload is too short to hold a header, so reading one below would run past the buffer + return HttpReq.WriteResponse(HttpResponse::BadRequest); + } + + RequestHeader Hdr; + 
Reader.Read(&Hdr, sizeof Hdr); + + if (Hdr.Magic != RequestHeader::kMagic) + { + // Must stop here: otherwise a second response would be written for the same request below + return HttpReq.WriteResponse(HttpResponse::BadRequest); + } + + // Make Response + + MemoryOutStream MemOut; + BinaryWriter MemWriter(MemOut); + + struct ResponseHeader + { + uint32_t Magic = 0xbada'b00f; + uint32_t ChunkCount; + uint32_t Reserved1 = 0; + uint32_t Reserved2 = 0; + }; + + struct ResponseChunkEntry + { + uint32_t CorrelationId; + uint32_t Flags = 0; + uint64_t ChunkSize; + }; + + return HttpReq.WriteResponse(HttpResponse::NotFound); + }, + HttpVerb::kPost); + + m_Router.RegisterRoute( + "{project}/oplog/{log}/files", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + // File manifest fetch, returns the client file list + + const auto& ProjectId = Req.GetCapture(1); + const auto& OplogId = Req.GetCapture(2); + + ProjectStore::Oplog* FoundLog = m_ProjectStore->OpenProjectOplog(ProjectId, OplogId); + + if (FoundLog == nullptr) + { + return HttpReq.WriteResponse(HttpResponse::NotFound); + } + + ProjectStore::Oplog& Log = *FoundLog; + + CbObjectWriter Response; + Response.BeginArray("files"); + + Log.IterateFileMap([&](const Oid& Id, const std::string_view& Path) { + Response.BeginObject(); + Response << "id" << Id; + Response << "path" << Path; + Response.EndObject(); + }); + + Response.EndArray(); + + return HttpReq.WriteResponse(HttpResponse::OK, Response.Save()); + }, + HttpVerb::kGet); + + m_Router.RegisterRoute( + "{project}/oplog/{log}/{chunk}/info", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + const auto& ProjectId = Req.GetCapture(1); + const auto& OplogId = Req.GetCapture(2); + const auto& ChunkId = Req.GetCapture(3); + + ProjectStore::Oplog* FoundLog = m_ProjectStore->OpenProjectOplog(ProjectId, OplogId); + + if (FoundLog == nullptr) + { + return HttpReq.WriteResponse(HttpResponse::NotFound); + } + + ProjectStore::Oplog& Log = *FoundLog; + + Oid Obj = Oid::FromHexString(ChunkId); + + 
IoBuffer Value = Log.FindChunk(Obj); + + if (Value) + { + CbObjectWriter Response; + Response << "size" << Value.Size(); + return HttpReq.WriteResponse(HttpResponse::OK, Response.Save()); + } + + return HttpReq.WriteResponse(HttpResponse::NotFound); + }, + HttpVerb::kGet); + + m_Router.RegisterRoute( + "{project}/oplog/{log}/{chunk}", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + const auto& ProjectId = Req.GetCapture(1); + const auto& OplogId = Req.GetCapture(2); + const auto& ChunkId = Req.GetCapture(3); + + bool IsOffset = false; + uint64_t Offset = 0; + uint64_t Size = ~(0ull); + + auto QueryParms = Req.ServerRequest().GetQueryParams(); + + if (auto OffsetParm = QueryParms.GetValue("offset"); OffsetParm.empty() == false) + { + if (auto OffsetVal = ParseInt<uint64_t>(OffsetParm)) + { + Offset = OffsetVal.value(); + IsOffset = true; + } + else + { + return HttpReq.WriteResponse(HttpResponse::BadRequest); + } + } + + if (auto SizeParm = QueryParms.GetValue("size"); SizeParm.empty() == false) + { + if (auto SizeVal = ParseInt<uint64_t>(SizeParm)) + { + Size = SizeVal.value(); + IsOffset = true; + } + else + { + return HttpReq.WriteResponse(HttpResponse::BadRequest); + } + } + + m_Log.debug("chunk - {} / {} / {}", ProjectId, OplogId, ChunkId); + + ProjectStore::Oplog* FoundLog = m_ProjectStore->OpenProjectOplog(ProjectId, OplogId); + + if (FoundLog == nullptr) + { + return HttpReq.WriteResponse(HttpResponse::NotFound); + } + + ProjectStore::Oplog& Log = *FoundLog; + + Oid Obj = Oid::FromHexString(ChunkId); + + IoBuffer Value = Log.FindChunk(Obj); + + switch (HttpVerb Verb = HttpReq.RequestVerb()) + { + case HttpVerb::kHead: + case HttpVerb::kGet: + if (!Value) + { + return HttpReq.WriteResponse(HttpResponse::NotFound); + } + + if (Verb == HttpVerb::kHead) + { + HttpReq.SetSuppressResponseBody(); + } + + if (IsOffset) + { + if (Offset > Value.Size()) + { + Offset = Value.Size(); + } + + // Offset is clamped above so the subtraction cannot underflow; comparing this way avoids + // the unsigned wrap-around of (Offset + Size) when Size is still the ~0 default + if (Size > (Value.Size() - Offset)) 
+ { + Size = Value.Size() - Offset; + } + + // Send only a subset of data + IoBuffer InnerValue(Value, Offset, Size); + + return HttpReq.WriteResponse(HttpResponse::OK, HttpContentType::kBinary, InnerValue); + } + + return HttpReq.WriteResponse(HttpResponse::OK, HttpContentType::kBinary, Value); + } + }, + HttpVerb::kGet | HttpVerb::kHead); + + m_Router.RegisterRoute( + "{project}/oplog/{log}/new", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + const auto& ProjectId = Req.GetCapture(1); + const auto& OplogId = Req.GetCapture(2); + + ProjectStore::Oplog* FoundLog = m_ProjectStore->OpenProjectOplog(ProjectId, OplogId); + + if (FoundLog == nullptr) + { + return HttpReq.WriteResponse(HttpResponse::NotFound); + } + + ProjectStore::Oplog& Log = *FoundLog; + + IoBuffer Payload = HttpReq.ReadPayload(); + + CbPackage Package; + Package.Load(Payload); + + CbObject Core = Package.GetObject(); + + if (!Core["key"sv]) + { + return HttpReq.WriteResponse(HttpResponse::BadRequest, HttpContentType::kText, "No oplog entry key specified"); + } + + // Write core to oplog + + const uint32_t OpLsn = Log.AppendNewOplogEntry(Package); + + if (OpLsn == ProjectStore::Oplog::kInvalidOp) + { + return HttpReq.WriteResponse(HttpResponse::BadRequest); + } + + m_Log.info("new op #{:4} - {}/{} ({:>6}) {}", OpLsn, ProjectId, OplogId, NiceBytes(Payload.Size()), Core["key"sv].AsString()); + + HttpReq.WriteResponse(HttpResponse::Created); + }, + HttpVerb::kPost); + + m_Router.RegisterRoute( + "{project}/oplog/{log}/{op}", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + // TODO: look up op and respond with the payload! 
+ + HttpReq.WriteResponse(HttpResponse::Accepted, HttpContentType::kText, u8"yeee"sv); + }, + HttpVerb::kGet); + + using namespace fmt::literals; + + m_Router.RegisterRoute( + "{project}/oplog/{log}", + [this](HttpRouterRequest& Req) { + const auto& ProjectId = Req.GetCapture(1); + const auto& OplogId = Req.GetCapture(2); + + ProjectStore::Project* ProjectIt = m_ProjectStore->OpenProject(ProjectId); + + if (!ProjectIt) + { + return Req.ServerRequest().WriteResponse(HttpResponse::NotFound, + HttpContentType::kText, + "project {} not found"_format(ProjectId)); + } + + ProjectStore::Project& Prj = *ProjectIt; + + switch (Req.ServerRequest().RequestVerb()) + { + case HttpVerb::kGet: + { + ProjectStore::Oplog* OplogIt = Prj.OpenOplog(OplogId); + + if (!OplogIt) + { + return Req.ServerRequest().WriteResponse(HttpResponse::NotFound, + HttpContentType::kText, + "oplog {} not found in project {}"_format(OplogId, ProjectId)); + } + + ProjectStore::Oplog& Log = *OplogIt; + + CbObjectWriter Cb; + Cb << "id"sv << Log.OplogId() << "project"sv << Prj.Identifier << "tempdir"sv << Log.TempDir(); + + Req.ServerRequest().WriteResponse(HttpResponse::OK, Cb.Save()); + } + break; + + case HttpVerb::kPost: + { + ProjectStore::Oplog* OplogIt = Prj.OpenOplog(OplogId); + + if (!OplogIt) + { + if (!Prj.NewOplog(OplogId)) + { + // TODO: indicate why the operation failed! 
+ return Req.ServerRequest().WriteResponse(HttpResponse::InternalServerError); + } + + m_Log.info("established oplog {} / {}", ProjectId, OplogId); + + return Req.ServerRequest().WriteResponse(HttpResponse::Created); + } + + // I guess this should ultimately be used to execute RPCs but for now, it + // does absolutely nothing + + return Req.ServerRequest().WriteResponse(HttpResponse::BadRequest); + } + break; + + case HttpVerb::kDelete: + { + spdlog::info("deleting oplog {}/{}", ProjectId, OplogId); + + ProjectIt->DeleteOplog(OplogId); + + return Req.ServerRequest().WriteResponse(HttpResponse::OK); + } + break; + } + }, + HttpVerb::kPost | HttpVerb::kGet | HttpVerb::kDelete); + + m_Router.RegisterRoute( + "{project}", + [this](HttpRouterRequest& Req) { + const std::string ProjectId = Req.GetCapture(1); + + switch (Req.ServerRequest().RequestVerb()) + { + case HttpVerb::kPost: + { + IoBuffer Payload = Req.ServerRequest().ReadPayload(); + CbObject Params = LoadCompactBinaryObject(Payload); + std::string_view Id = Params["id"sv].AsString(); + std::string_view Root = Params["root"sv].AsString(); + std::string_view EngineRoot = Params["engine"sv].AsString(); + std::string_view ProjectRoot = Params["project"sv].AsString(); + + const std::filesystem::path BasePath = m_ProjectStore->BasePath() / ProjectId; + m_ProjectStore->NewProject(BasePath, ProjectId, Root, EngineRoot, ProjectRoot); + + m_Log.info("established project - {} (id: '{}', roots: '{}', '{}', '{}')", + ProjectId, + Id, + Root, + EngineRoot, + ProjectRoot); + + Req.ServerRequest().WriteResponse(HttpResponse::Created); + } + break; + + case HttpVerb::kGet: + { + ProjectStore::Project* ProjectIt = m_ProjectStore->OpenProject(ProjectId); + + if (!ProjectIt) + { + return Req.ServerRequest().WriteResponse(HttpResponse::NotFound, + HttpContentType::kText, + "project {} not found"_format(ProjectId)); + } + + const ProjectStore::Project& Prj = *ProjectIt; + + CbObjectWriter Response; + Response << "id" << 
Prj.Identifier << "root" << WideToUtf8(Prj.RootDir.c_str()); + + Response.BeginArray("oplogs"sv); + Prj.IterateOplogs([&](const ProjectStore::Oplog& I) { Response << "id"sv << I.OplogId(); }); + Response.EndArray(); // oplogs + + Req.ServerRequest().WriteResponse(HttpResponse::OK, Response.Save()); + } + break; + + case HttpVerb::kDelete: + { + ProjectStore::Project* ProjectIt = m_ProjectStore->OpenProject(ProjectId); + + if (!ProjectIt) + { + return Req.ServerRequest().WriteResponse(HttpResponse::NotFound, + HttpContentType::kText, + "project {} not found"_format(ProjectId)); + } + + m_ProjectStore->DeleteProject(ProjectId); + + // Acknowledge the deletion, matching the oplog DELETE route; previously no response was written on this path + Req.ServerRequest().WriteResponse(HttpResponse::OK); + } + break; + } + }, + HttpVerb::kGet | HttpVerb::kPost | HttpVerb::kDelete); +} + +HttpProjectService::~HttpProjectService() +{ +} + +const char* +HttpProjectService::BaseUri() const +{ + return "/prj/"; +} + +void +HttpProjectService::HandleRequest(HttpServerRequest& Request) +{ + if (m_Router.HandleRequest(Request) == false) + { + m_Log.warn("No route found for {0}", Request.RelativeUri()); + } +} + +////////////////////////////////////////////////////////////////////////// + +class SecurityAttributes +{ +public: + inline SECURITY_ATTRIBUTES* Attributes() { return &m_Attributes; } + +protected: + SECURITY_ATTRIBUTES m_Attributes{}; + SECURITY_DESCRIPTOR m_Sd{}; +}; + +// Security attributes which allows any user access + +class AnyUserSecurityAttributes : public SecurityAttributes +{ +public: + AnyUserSecurityAttributes() + { + m_Attributes.nLength = sizeof m_Attributes; + m_Attributes.bInheritHandle = false; // Disable inheritance + + const BOOL success = InitializeSecurityDescriptor(&m_Sd, SECURITY_DESCRIPTOR_REVISION); + + if (success) + { + const BOOL bSetOk = SetSecurityDescriptorDacl(&m_Sd, TRUE, (PACL)NULL, FALSE); + if (bSetOk) + { + m_Attributes.lpSecurityDescriptor = &m_Sd; + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// + +struct LocalProjectService::LocalProjectImpl +{ + 
LocalProjectImpl() : m_WorkerThreadPool(ServiceThreadCount) {} + ~LocalProjectImpl() { Stop(); } + + void Start() + { + ZEN_ASSERT(!m_IsStarted); + + for (int i = 0; i < 32; ++i) + { + PipeConnection* NewPipe = new PipeConnection(this); + m_ServicePipes.push_back(NewPipe); + m_IoContext.post([NewPipe] { NewPipe->Accept(); }); + } + + for (int i = 0; i < ServiceThreadCount; ++i) + { + asio::post(m_WorkerThreadPool, [this] { + try + { + m_IoContext.run(); + } + catch (std::exception& ex) + { + spdlog::error("exception caught in pipe project service loop: {}", ex.what()); + } + + m_ShutdownLatch.count_down(); + }); + } + + m_IsStarted = true; + } + + void Stop() + { + if (!m_IsStarted) + { + return; + } + + for (PipeConnection* Pipe : m_ServicePipes) + { + Pipe->Disconnect(); + } + + m_IoContext.stop(); + m_ShutdownLatch.wait(); + + for (PipeConnection* Pipe : m_ServicePipes) + { + delete Pipe; + } + + m_ServicePipes.clear(); + } + +private: + asio::io_context& IoContext() { return m_IoContext; } + auto PipeSecurityAttributes() { return m_AnyUserSecurityAttributes.Attributes(); } + static const int ServiceThreadCount = 4; + + std::latch m_ShutdownLatch{ServiceThreadCount}; + asio::thread_pool m_WorkerThreadPool; + asio::io_context m_IoContext; + + class PipeConnection + { + enum PipeState + { + kUninitialized, + kConnecting, + kReading, + kWriting, + kDisconnected, + kInvalid + }; + + LocalProjectImpl* m_Outer; + asio::windows::stream_handle m_PipeHandle; + std::atomic<PipeState> m_PipeState{kUninitialized}; + + public: + PipeConnection(LocalProjectImpl* Outer) : m_Outer(Outer), m_PipeHandle{m_Outer->IoContext()} {} + ~PipeConnection() {} + + void Disconnect() + { + m_PipeState = kDisconnected; + DisconnectNamedPipe(m_PipeHandle.native_handle()); + } + + void Accept() + { + StringBuilder<64> PipeName; + PipeName << "\\\\.\\pipe\\zenprj"; // TODO: this should use an instance-specific identifier! 
+ + HANDLE hPipe = CreateNamedPipeA(PipeName.c_str(), + PIPE_ACCESS_DUPLEX | FILE_FLAG_OVERLAPPED, + PIPE_TYPE_MESSAGE | PIPE_READMODE_MESSAGE | PIPE_WAIT, + PIPE_UNLIMITED_INSTANCES, // Max instance count + 65536, // Output buffer size + 65536, // Input buffer size + 10'000, // Default timeout (ms) + m_Outer->PipeSecurityAttributes() // Security attributes + ); + + if (hPipe == INVALID_HANDLE_VALUE) + { + spdlog::warn("failed while creating named pipe {}", PipeName.c_str()); + + // TODO: error - how to best handle? + } + + m_PipeHandle.assign(hPipe); // This now owns the handle and will close it + + m_PipeState = kConnecting; + + asio::windows::overlapped_ptr OverlappedPtr( + m_PipeHandle.get_executor(), + std::bind(&PipeConnection::OnClientConnect, this, std::placeholders::_1, std::placeholders::_2)); + + OVERLAPPED* Overlapped = OverlappedPtr.get(); + BOOL Ok = ConnectNamedPipe(hPipe, Overlapped); + DWORD LastError = GetLastError(); + + if (!Ok && LastError != ERROR_IO_PENDING) + { + m_PipeState = kInvalid; + + // The operation completed immediately, so a completion notification needs + // to be posted. When complete() is called, ownership of the OVERLAPPED- + // derived object passes to the io_service. + std::error_code Ec(LastError, asio::error::get_system_category()); + OverlappedPtr.complete(Ec, 0); + } + else + { + // The operation was successfully initiated, so ownership of the + // OVERLAPPED-derived object has now passed to the io_service. 
+ OverlappedPtr.release(); + } + } + + private: + void OnClientConnect(const std::error_code& Ec, size_t BytesTransferred) + { + ZEN_UNUSED(BytesTransferred); + + if (Ec) + { + if (m_PipeState == kDisconnected) + { + return; + } + + spdlog::warn("pipe connection error: {}", Ec.message()); + + // TODO: should disconnect and issue a new connect + return; + } + + spdlog::debug("pipe connection established"); + + IssueRead(); + } + + void IssueRead() + { + m_PipeState = kReading; + + m_PipeHandle.async_read_some(asio::mutable_buffer(m_MsgBuffer, sizeof m_MsgBuffer), + std::bind(&PipeConnection::OnClientRead, this, std::placeholders::_1, std::placeholders::_2)); + } + + void OnClientRead(const std::error_code& Ec, size_t Bytes) + { + if (Ec) + { + if (m_PipeState == kDisconnected) + { + return; + } + + spdlog::warn("pipe read error: {}", Ec.message()); + + // TODO: should disconnect and issue a new connect + return; + } + + spdlog::debug("received message: {} bytes", Bytes); + + // TODO: Actually process request + + m_PipeState = kWriting; + + asio::async_write(m_PipeHandle, + asio::buffer(m_MsgBuffer, Bytes), + std::bind(&PipeConnection::OnWriteCompletion, this, std::placeholders::_1, std::placeholders::_2)); + } + + void OnWriteCompletion(const std::error_code& Ec, size_t Bytes) + { + ZEN_UNUSED(Bytes); + + if (Ec) + { + if (m_PipeState == kDisconnected) + { + return; + } + + spdlog::warn("pipe write error: {}", Ec.message()); + + // TODO: should disconnect and issue a new connect + return; + } + + // Go back to reading + IssueRead(); + } + + uint8_t m_MsgBuffer[16384]; + }; + + AnyUserSecurityAttributes m_AnyUserSecurityAttributes; + std::vector<PipeConnection*> m_ServicePipes; + bool m_IsStarted = false; +}; + +LocalProjectService::LocalProjectService(CasStore& Store, ProjectStore* Projects) : m_CasStore(Store), m_ProjectStore(Projects) +{ + m_Impl = std::make_unique<LocalProjectImpl>(); + m_Impl->Start(); +} + +LocalProjectService::~LocalProjectService() +{ + 
m_Impl->Stop(); +} + +////////////////////////////////////////////////////////////////////////// + +} // namespace zen diff --git a/zenserver/projectstore.h b/zenserver/projectstore.h new file mode 100644 index 000000000..4ad0e42e0 --- /dev/null +++ b/zenserver/projectstore.h @@ -0,0 +1,241 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/httpserver.h> +#include <zencore/uid.h> +#include <zencore/xxhash.h> +#include <zenstore/cas.h> +#include <zenstore/caslog.h> + +#include <spdlog/spdlog.h> +#include <tsl/robin_map.h> +#include <filesystem> +#include <map> +#include <string> + +namespace zen { + +class CbPackage; + +struct OplogEntry +{ + uint32_t OpLsn; + uint32_t OpCoreOffset; // note: Multiple of alignment! + uint32_t OpCoreSize; + uint32_t OpCoreHash; // Used as checksum + XXH3_128 OpKeyHash; // XXH128_canonical_t + + inline Oid OpKeyAsOId() const + { + Oid Id; + memcpy(Id.OidBits, &OpKeyHash, sizeof Id.OidBits); + return Id; + } +}; + +static_assert(IsPow2(sizeof(OplogEntry))); + +/** Project Store + */ +class ProjectStore : public RefCounted +{ + struct OplogStorage; + +public: + ProjectStore(CasStore& Store, std::filesystem::path BasePath); + ~ProjectStore(); + + struct Project; + + struct Oplog + { + Oplog(std::string_view Id, Project* Outer, CasStore& Store, std::filesystem::path BasePath); + ~Oplog(); + + [[nodiscard]] static bool ExistsAt(std::filesystem::path BasePath); + + void IterateFileMap(std::function<void(const Oid&, const std::string_view&)>&& Fn); + + IoBuffer FindChunk(Oid ChunkId); + + inline static const uint32_t kInvalidOp = ~0u; + + /** Persist a new oplog entry + * + * Returns the oplog LSN assigned to the new entry, or kInvalidOp if the entry is rejected + */ + uint32_t AppendNewOplogEntry(CbPackage Op); + + enum UpdateType + { + kUpdateNewEntry, + kUpdateReplay + }; + + /** Update tracking metadata for a new oplog entry + * + * This is used during replay (and gets called as part of new op 
append) + * + * Returns the oplog LSN assigned to the new entry, or kInvalidOp if the entry is rejected + */ + uint32_t RegisterOplogEntry(CbObject Core, const OplogEntry& OpEntry, UpdateType TypeOfUpdate); + + /** Scan oplog and register each entry, thus updating the in-memory tracking tables + */ + void ReplayLog(); + + const std::string& OplogId() const { return m_OplogId; } + + const std::wstring& TempDir() const { return m_TempPath.native(); } + + spdlog::logger& Log() { return m_OuterProject->Log(); } + + private: + struct OplogEntryAddress + { + uint64_t Offset; + uint64_t Size; + }; + + template<class V> + using OidMap = tsl::robin_map<Oid, V, Oid::Hasher>; + + Project* m_OuterProject = nullptr; + RwLock m_OplogLock; + CasStore& m_CasStore; + std::filesystem::path m_BasePath; + std::filesystem::path m_TempPath; + + OidMap<IoHash> m_ChunkMap; // output data chunk id -> CAS address + OidMap<IoHash> m_MetaMap; // meta chunk id -> CAS address + OidMap<std::string> m_FileMap; // file id -> client file + OidMap<std::string> m_ServerFileMap; // file id -> server file + std::map<int, OplogEntryAddress> m_OpAddressMap; // Index LSN -> op data in ops blob file + OidMap<int> m_LatestOpMap; // op key -> latest op LSN for key + + RefPtr<OplogStorage> m_Storage; + std::string m_OplogId; + + void AddFileMapping(Oid FileId, std::string_view Path); + void AddServerFileMapping(Oid FileId, std::string_view Path); + void AddChunkMapping(Oid ChunkId, IoHash Hash); + void AddMetaMapping(Oid ChunkId, IoHash Hash); + }; + + struct Project + { + std::string Identifier; + std::filesystem::path RootDir; + std::string EngineRootDir; + std::string ProjectRootDir; + + Oplog* NewOplog(std::string_view OplogId); + Oplog* OpenOplog(std::string_view OplogId); + void DeleteOplog(std::string_view OplogId); + void IterateOplogs(std::function<void(const Oplog&)>&& Fn) const; + + Project(ProjectStore* PrjStore, CasStore& Store, std::filesystem::path BasePath); + ~Project(); + + void Read(); + 
void Write(); + [[nodiscard]] static bool Exists(std::filesystem::path BasePath); + + spdlog::logger& Log(); + + private: + ProjectStore* m_ProjectStore; + CasStore& m_CasStore; + mutable RwLock m_ProjectLock; + std::map<std::string, Oplog> m_Oplogs; + std::filesystem::path m_OplogStoragePath; + + std::filesystem::path BasePathForOplog(std::string_view OplogId); + }; + + Oplog* OpenProjectOplog(std::string_view ProjectId, std::string_view OplogId); + + Project* OpenProject(std::string_view ProjectId); + Project* NewProject(std::filesystem::path BasePath, + std::string_view ProjectId, + std::string_view RootDir, + std::string_view EngineRootDir, + std::string_view ProjectRootDir); + void DeleteProject(std::string_view ProjectId); + bool Exists(std::string_view ProjectId); + + spdlog::logger& Log() { return m_Log; } + const std::filesystem::path& BasePath() const { return m_ProjectBasePath; } + +private: + spdlog::logger m_Log; + CasStore& m_CasStore; + std::filesystem::path m_ProjectBasePath; + RwLock m_ProjectsLock; + std::map<std::string, Project> m_Projects; + + std::filesystem::path BasePathForProject(std::string_view ProjectId); +}; + +////////////////////////////////////////////////////////////////////////// +// +// {ns} a root namespace, should be associated with the project which owns it +// {target} a variation of the project, typically a build target +// {lsn} oplog entry sequence number +// +// /prj/{ns} +// /prj/{ns}/oplog/{target} +// /prj/{ns}/oplog/{target}/{lsn} +// +// oplog entry +// +// id: {id} +// key: {} +// meta: {} +// data: [] +// refs: +// + +class HttpProjectService : public HttpService +{ +public: + HttpProjectService(CasStore& Store, ProjectStore* Projects); + ~HttpProjectService(); + + virtual const char* BaseUri() const override; + virtual void HandleRequest(HttpServerRequest& Request) override; + +private: + CasStore& m_CasStore; + spdlog::logger m_Log; + HttpRequestRouter m_Router; + Ref<ProjectStore> m_ProjectStore; +}; + +/** 
Project store interface for local clients + * + * This provides the same functionality as the HTTP interface but with + * some optimizations which are only possible for clients running on the + * same host as the Zen Store instance + * + */ + +class LocalProjectService : public RefCounted +{ +protected: + LocalProjectService(CasStore& Store, ProjectStore* Projects); + ~LocalProjectService(); + +public: + static inline Ref<LocalProjectService> New(CasStore& Store, ProjectStore* Projects) { return new LocalProjectService(Store, Projects); } + +private: + struct LocalProjectImpl; + + CasStore& m_CasStore; + Ref<ProjectStore> m_ProjectStore; + std::unique_ptr<LocalProjectImpl> m_Impl; +}; + +} // namespace zen diff --git a/zenserver/targetver.h b/zenserver/targetver.h new file mode 100644 index 000000000..d432d6993 --- /dev/null +++ b/zenserver/targetver.h @@ -0,0 +1,10 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +// Including SDKDDKVer.h defines the highest available Windows platform. + +// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and +// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. + +#include <SDKDDKVer.h> diff --git a/zenserver/testing/launch.cpp b/zenserver/testing/launch.cpp new file mode 100644 index 000000000..119055e44 --- /dev/null +++ b/zenserver/testing/launch.cpp @@ -0,0 +1,490 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "launch.h" + +#include <zencore/compactbinary.h> +#include <zencore/compactbinarybuilder.h> +#include <zencore/filesystem.h> +#include <zencore/fmtutils.h> +#include <zencore/iobuffer.h> +#include <zencore/iohash.h> +#include <zencore/windows.h> +#include <zenstore/CAS.h> + +#include <AccCtrl.h> +#include <AclAPI.h> +#include <sddl.h> + +#include <UserEnv.h> +#pragma comment(lib, "UserEnv.lib") + +#include <atlbase.h> +#include <filesystem> +#include <span> + +using namespace std::literals; + +namespace zen { + +struct BasicJob +{ +public: + BasicJob() = default; + ~BasicJob(); + + void SetWorkingDirectory(const std::filesystem::path& WorkingDirectory) { m_WorkingDirectory = WorkingDirectory; } + bool SpawnJob(std::filesystem::path ExePath, std::wstring CommandLine); + bool Wait(uint32_t TimeoutMs = ~0); + +private: + std::filesystem::path m_WorkingDirectory; + int m_ProcessId = 0; + CHandle m_ProcessHandle; +}; + +BasicJob::~BasicJob() +{ + Wait(); +} + +bool +BasicJob::SpawnJob(std::filesystem::path ExePath, std::wstring CommandLine) +{ + using namespace fmt::literals; + + STARTUPINFOEX StartupInfo = {sizeof(STARTUPINFOEX)}; + PROCESS_INFORMATION ProcessInfo{}; + + std::wstring ExePathNative = ExePath.native(); + std::wstring WorkingDirNative = m_WorkingDirectory.native(); + + BOOL Created = ::CreateProcess(ExePathNative.data() /* ApplicationName */, + CommandLine.data() /* Command Line */, + nullptr /* Process Attributes */, + nullptr /* Security Attributes */, + FALSE /* InheritHandles */, + 0 /* Flags */, + nullptr /* Environment */, + WorkingDirNative.data() /* Current Directory */, + (LPSTARTUPINFO)&StartupInfo, + &ProcessInfo); + + if (!Created) + { + throw std::system_error(::GetLastError(), std::system_category(), "Failed to create process '{}'"_format(ExePath).c_str()); + } + + m_ProcessId = ProcessInfo.dwProcessId; + m_ProcessHandle.Attach(ProcessInfo.hProcess); + ::CloseHandle(ProcessInfo.hThread); + + spdlog::info("Created process {}", 
m_ProcessId); + + return true; +} + +bool +BasicJob::Wait(uint32_t TimeoutMs) +{ + if (!m_ProcessHandle) + { + return true; + } + + DWORD WaitResult = WaitForSingleObject(m_ProcessHandle, TimeoutMs); + + if (WaitResult == WAIT_TIMEOUT) + { + return false; + } + + if (WaitResult == WAIT_OBJECT_0) + { + return true; + } + + throw std::exception("Failed wait on process handle"); +} + +struct SandboxedJob +{ + SandboxedJob() = default; + ~SandboxedJob() = default; + + void SetWorkingDirectory(const std::filesystem::path& WorkingDirectory) { m_WorkingDirectory = WorkingDirectory; } + void Initialize(std::string_view AppContainerId); + bool SpawnJob(std::filesystem::path ExePath); + void AddWhitelistFile(const std::filesystem::path& FilePath) { m_WhitelistFiles.push_back(FilePath); } + +private: + bool GrantNamedObjectAccess(PWSTR Name, SE_OBJECT_TYPE Type, ACCESS_MASK AccessMask, bool Recursive); + + std::filesystem::path m_WorkingDirectory; + std::vector<std::filesystem::path> m_WhitelistFiles; + std::vector<std::wstring> m_WhitelistRegistryKeys; + PSID m_AppContainerSid = nullptr; + bool m_IsInitialized = false; +}; + +bool +SandboxedJob::GrantNamedObjectAccess(PWSTR ObjectName, SE_OBJECT_TYPE ObjectType, ACCESS_MASK AccessMask, bool Recursive) +{ + DWORD Status; + PACL NewAcl = nullptr; + + DWORD grfInhericance = 0; + + if (Recursive) + { + grfInhericance = OBJECT_INHERIT_ACE | CONTAINER_INHERIT_ACE; + } + + EXPLICIT_ACCESS Access{.grfAccessPermissions = AccessMask, + .grfAccessMode = GRANT_ACCESS, + .grfInheritance = grfInhericance, + .Trustee = {.pMultipleTrustee = nullptr, + .MultipleTrusteeOperation = NO_MULTIPLE_TRUSTEE, + .TrusteeForm = TRUSTEE_IS_SID, + .TrusteeType = TRUSTEE_IS_GROUP, + .ptstrName = (PWSTR)m_AppContainerSid}}; + + PACL OldAcl = nullptr; + + Status = GetNamedSecurityInfo(ObjectName /* ObjectName */, + ObjectType /* ObjectType */, + DACL_SECURITY_INFORMATION /* SecurityInfo */, + nullptr /* ppsidOwner */, + nullptr /* ppsidGroup */, + &OldAcl 
/* ppDacl */, + nullptr /* ppSacl */, + nullptr /* ppSecurityDescriptor */); + if (Status != ERROR_SUCCESS) + return false; + + Status = SetEntriesInAcl(1 /* CountOfExplicitEntries */, &Access /* pListOfExplicitEntries */, OldAcl, &NewAcl); + if (Status != ERROR_SUCCESS) + return false; + + Status = SetNamedSecurityInfo(ObjectName /* ObjectName */, + ObjectType /* ObjectType */, + DACL_SECURITY_INFORMATION /*SecurityInfo */, + nullptr /* psidOwner */, + nullptr /* psidGroup */, + NewAcl /* pDacl */, + nullptr /* pSacl */); + if (NewAcl) + ::LocalFree(NewAcl); + + return Status == ERROR_SUCCESS; +} + +void +SandboxedJob::Initialize(std::string_view AppContainerId) +{ + if (m_IsInitialized) + { + return; + } + + std::wstring ContainerName = zen::Utf8ToWide(AppContainerId); + + HRESULT hRes = ::CreateAppContainerProfile(ContainerName.c_str(), + ContainerName.c_str() /* Display Name */, + ContainerName.c_str() /* Description */, + nullptr /* Capabilities */, + 0 /* Capability Count */, + &m_AppContainerSid); + + if (FAILED(hRes)) + { + hRes = ::DeriveAppContainerSidFromAppContainerName(ContainerName.c_str(), &m_AppContainerSid); + + if (FAILED(hRes)) + { + spdlog::error("Failed creating app container SID"); + } + } + + // Debugging context + + PWSTR Str = nullptr; + ::ConvertSidToStringSid(m_AppContainerSid, &Str); + + spdlog::info("AppContainer SID : '{}'", WideToUtf8(Str)); + + PWSTR Path = nullptr; + if (SUCCEEDED(::GetAppContainerFolderPath(Str, &Path))) + { + spdlog::info("AppContainer folder: '{}'", WideToUtf8(Path)); + + ::CoTaskMemFree(Path); + } + ::LocalFree(Str); + + m_IsInitialized = true; +} + +bool +SandboxedJob::SpawnJob(std::filesystem::path ExePath) +{ + // Build process attributes + + SECURITY_CAPABILITIES Sc = {0}; + Sc.AppContainerSid = m_AppContainerSid; + + STARTUPINFOEX StartupInfo = {sizeof(STARTUPINFOEX)}; + PROCESS_INFORMATION ProcessInfo{}; + SIZE_T Size = 0; + + ::InitializeProcThreadAttributeList(nullptr, 1, 0, &Size); + + auto AttrBuffer = 
std::make_unique<uint8_t[]>(Size); + StartupInfo.lpAttributeList = reinterpret_cast<LPPROC_THREAD_ATTRIBUTE_LIST>(AttrBuffer.get()); + + if (!::InitializeProcThreadAttributeList(StartupInfo.lpAttributeList, 1, 0, &Size)) + { + return false; + } + + if (!::UpdateProcThreadAttribute(StartupInfo.lpAttributeList, + 0, + PROC_THREAD_ATTRIBUTE_SECURITY_CAPABILITIES, + &Sc, + sizeof Sc, + nullptr, + nullptr)) + { + return false; + } + + // Set up security for files/folders/registry + + for (const std::filesystem::path& File : m_WhitelistFiles) + { + std::wstring NativeFileName = File.native(); + GrantNamedObjectAccess(NativeFileName.data(), SE_FILE_OBJECT, FILE_ALL_ACCESS, true); + } + + for (std::wstring& RegKey : m_WhitelistRegistryKeys) + { + GrantNamedObjectAccess(RegKey.data(), SE_REGISTRY_WOW64_32KEY, KEY_ALL_ACCESS, true); + } + + std::wstring ExePathNative = ExePath.native(); + std::wstring WorkingDirNative = m_WorkingDirectory.native(); + + BOOL Created = ::CreateProcess(nullptr /* ApplicationName */, + ExePathNative.data() /* Command line */, + nullptr /* Process Attributes */, + nullptr /* Security Attributes */, + FALSE /* InheritHandles */, + EXTENDED_STARTUPINFO_PRESENT | CREATE_NEW_CONSOLE /* Flags */, + nullptr /* Environment */, + WorkingDirNative.data() /* Current Directory */, + (LPSTARTUPINFO)&StartupInfo, + &ProcessInfo); + + DeleteProcThreadAttributeList(StartupInfo.lpAttributeList); + + if (!Created) + { + return false; + } + + spdlog::info("Created process {}", ProcessInfo.dwProcessId); + + return true; +} + +HttpLaunchService::HttpLaunchService(CasStore& Store) +: m_Log("exec", begin(spdlog::default_logger()->sinks()), end(spdlog::default_logger()->sinks())) +, m_CasStore(Store) +{ + m_Router.AddPattern("job", "([[:digit:]]+)"); + + m_Router.RegisterRoute( + "jobs/{job}", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + switch (HttpReq.RequestVerb()) + { + case HttpVerb::kGet: + break; + + case 
HttpVerb::kPost: + break; + } + }, + HttpVerb::kGet | HttpVerb::kPost); + + // Experimental + +#if 0 + m_Router.RegisterRoute( + "jobs/sandbox", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + switch (HttpReq.RequestVerb()) + { + case HttpVerb::kGet: + break; + + case HttpVerb::kPost: + { + SandboxedJob Job; + Job.Initialize("zen_test"); + Job.SetWorkingDirectory("c:\\temp\\sandbox1"); + Job.AddWhitelistFile("c:\\temp\\sandbox1"); + Job.SpawnJob("c:\\windows\\system32\\cmd.exe"); + } + break; + } + }, + HttpVerb::kGet | HttpVerb::kPost); +#endif + + m_Router.RegisterRoute( + "jobs/prep", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + switch (HttpReq.RequestVerb()) + { + case HttpVerb::kPost: + { + // This operation takes the proposed job spec and identifies which + // chunks are not present on this server. This list is then returned in + // the "need" list in the response + + IoBuffer Payload = HttpReq.ReadPayload(); + CbObject RequestObject = LoadCompactBinaryObject(Payload); + + std::vector<IoHash> NeedList; + + for (auto Entry : RequestObject["files"sv]) + { + CbObjectView Ob = Entry.AsObjectView(); + + const IoHash FileHash = Ob["hash"sv].AsHash(); + + if (!m_CasStore.FindChunk(FileHash)) + { + spdlog::debug("NEED: {} {} {}", FileHash, Ob["file"sv].AsString(), Ob["size"sv].AsUInt64()); + + NeedList.push_back(FileHash); + } + } + + CbObjectWriter Cbo; + Cbo.BeginArray("need"); + + for (const IoHash& Hash : NeedList) + { + Cbo << Hash; + } + + Cbo.EndArray(); + CbObject Response = Cbo.Save(); + + return HttpReq.WriteResponse(HttpResponse::OK, Response); + } + break; + } + }, + HttpVerb::kPost); + + m_Router.RegisterRoute( + "jobs", + [this](HttpRouterRequest& Req) { + HttpServerRequest& HttpReq = Req.ServerRequest(); + + switch (HttpReq.RequestVerb()) + { + case HttpVerb::kGet: + break; + + case HttpVerb::kPost: + { + IoBuffer Payload = HttpReq.ReadPayload(); + CbObject 
RequestObject = LoadCompactBinaryObject(Payload); + + bool AllOk = true; + + std::vector<IoHash> NeedList; + + // TODO: auto-generate! + std::filesystem::path SandboxDir{"c:\\temp\\sandbox1"}; + zen::DeleteDirectories(SandboxDir); + zen::CreateDirectories(SandboxDir); + + for (auto Entry : RequestObject["files"sv]) + { + CbObjectView Ob = Entry.AsObjectView(); + + std::string_view FileName = Ob["file"sv].AsString(); + const IoHash FileHash = Ob["hash"sv].AsHash(); + uint64_t FileSize = Ob["size"sv].AsUInt64(); + + if (IoBuffer Chunk = m_CasStore.FindChunk(FileHash); !Chunk) + { + spdlog::debug("MISSING: {} {} {}", FileHash, FileName, FileSize); + AllOk = false; + + NeedList.push_back(FileHash); + } + else + { + std::filesystem::path FullPath = SandboxDir / FileName; + + const IoBuffer* Chunks[] = {&Chunk}; + + zen::WriteFile(FullPath, Chunks, 1); + } + } + + if (!AllOk) + { + // TODO: Could report all the missing pieces in the response here + return HttpReq.WriteResponse(HttpResponse::NotFound); + } + + std::wstring Executable = Utf8ToWide(RequestObject["cmd"].AsString()); + std::wstring Args = Utf8ToWide(RequestObject["args"].AsString()); + + std::filesystem::path ExeName = SandboxDir / Executable; + + BasicJob Job; + Job.SetWorkingDirectory(SandboxDir); + Job.SpawnJob(ExeName, Args); + Job.Wait(); + + return HttpReq.WriteResponse(HttpResponse::OK); + } + break; + } + }, + HttpVerb::kGet | HttpVerb::kPost); +} + +HttpLaunchService::~HttpLaunchService() +{ +} + +const char* +HttpLaunchService::BaseUri() const +{ + return "/exec/"; +} + +void +HttpLaunchService::HandleRequest(HttpServerRequest& Request) +{ + if (m_Router.HandleRequest(Request) == false) + { + m_Log.warn("No route found for {0}", Request.RelativeUri()); + } +} + +} // namespace zen diff --git a/zenserver/testing/launch.h b/zenserver/testing/launch.h new file mode 100644 index 000000000..5dd946eda --- /dev/null +++ b/zenserver/testing/launch.h @@ -0,0 +1,31 @@ +// Copyright Epic Games, Inc. 
All Rights Reserved. + +#pragma once + +#include <zencore/httpserver.h> + +#include <spdlog/spdlog.h> + +namespace zen { + +class CasStore; + +/** + * Process launcher for test executables + */ +class HttpLaunchService : public HttpService +{ +public: + HttpLaunchService(CasStore& Store); + ~HttpLaunchService(); + + virtual const char* BaseUri() const override; + virtual void HandleRequest(HttpServerRequest& Request) override; + +private: + spdlog::logger m_Log; + HttpRequestRouter m_Router; + CasStore& m_CasStore; +}; + +} // namespace zen diff --git a/zenserver/upstream/jupiter.cpp b/zenserver/upstream/jupiter.cpp new file mode 100644 index 000000000..6b54f3d01 --- /dev/null +++ b/zenserver/upstream/jupiter.cpp @@ -0,0 +1,277 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "jupiter.h" + +#include <fmt/format.h> +#include <zencore/iobuffer.h> +#include <zencore/iohash.h> +#include <zencore/string.h> +#include <zencore/thread.h> + +// For some reason, these don't seem to stick, so we disable the warnings +//# define _SILENCE_CXX17_C_HEADER_DEPRECATION_WARNING 1 +//# define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS 1 +#pragma warning(push) +#pragma warning(disable : 4004) +#pragma warning(disable : 4996) +#include <cpr/cpr.h> +#pragma warning(pop) + +#if ZEN_PLATFORM_WINDOWS +# pragma comment(lib, "Crypt32.lib") +# pragma comment(lib, "Wldap32.lib") +#endif + +#include <spdlog/spdlog.h> +#include <json11.hpp> + +using namespace std::literals; +using namespace fmt::literals; + +namespace zen { + +namespace detail { + struct CloudCacheSessionState + { + CloudCacheSessionState(CloudCacheClient& Client) : OwnerClient(Client) {} + ~CloudCacheSessionState() {} + + void Reset() + { + std::string Auth; + OwnerClient.AcquireAccessToken(Auth); + + Session.SetBody({}); + Session.SetOption(cpr::Header{{"Authorization", Auth}}); + } + + CloudCacheClient& OwnerClient; + cpr::Session Session; + }; +} // namespace detail + 
+CloudCacheSession::CloudCacheSession(CloudCacheClient* OuterClient) : m_CacheClient(OuterClient) +{ + m_SessionState = m_CacheClient->AllocSessionState(); +} + +CloudCacheSession::~CloudCacheSession() +{ + m_CacheClient->FreeSessionState(m_SessionState); +} + +#define TESTING_PREFIX "aaaaa" + +IoBuffer +CloudCacheSession::Get(std::string_view BucketId, std::string_view Key) +{ + ExtendableStringBuilder<256> Uri; + Uri << m_CacheClient->ServiceUrl(); + Uri << "/api/v1/c/ddc/" << m_CacheClient->Namespace() << "/" << BucketId << "/" TESTING_PREFIX << Key << ".raw"; + + auto& Session = m_SessionState->Session; + Session.SetUrl(cpr::Url{Uri.c_str()}); + + cpr::Response Response = Session.Get(); + + if (!Response.error) + { + return IoBufferBuilder::MakeCloneFromMemory(Response.text.data(), Response.text.size()); + } + + return {}; +} + +void +CloudCacheSession::Put(std::string_view BucketId, std::string_view Key, IoBuffer Data) +{ + ExtendableStringBuilder<256> Uri; + Uri << m_CacheClient->ServiceUrl(); + Uri << "/api/v1/c/ddc/" << m_CacheClient->Namespace() << "/" << BucketId << "/" TESTING_PREFIX << Key; + + auto& Session = m_SessionState->Session; + + IoHash Hash = IoHash::HashMemory(Data.Data(), Data.Size()); + + std::string Auth; + m_CacheClient->AcquireAccessToken(Auth); + Session.SetOption(cpr::Url{Uri.c_str()}); + Session.SetOption( + cpr::Header{{"Authorization", Auth}, {"X-Jupiter-IoHash", Hash.ToHexString()}, {"Content-Type", "application/octet-stream"}}); + Session.SetOption(cpr::Body{(const char*)Data.Data(), Data.Size()}); + + cpr::Response Response = Session.Put(); + + if (Response.error) + { + spdlog::warn("PUT failed: '{}'", Response.error.message); + } +} + +////////////////////////////////////////////////////////////////////////// + +std::string +CloudCacheAccessToken::GetAuthorizationHeaderValue() +{ + RwLock::SharedLockScope _(m_Lock); + + return "Bearer {}"_format(m_Token); +} + +inline void +CloudCacheAccessToken::SetToken(std::string_view Token) 
+{ + RwLock::ExclusiveLockScope _(m_Lock); + m_Token = Token; + ++m_Serial; +} + +////////////////////////////////////////////////////////////////////////// +// +// ServiceUrl: https://jupiter.devtools.epicgames.com +// Namespace: ue4.ddc +// OAuthClientId: 0oao91lrhqPiAlaGD0x7 +// OAuthProvider: https://epicgames.okta.com/oauth2/auso645ojjWVdRI3d0x7/v1/token +// OAuthSecret: -GBWjjenhCgOwhxL5yBKNJECVIoDPH0MK4RDuN7d +// + +CloudCacheClient::CloudCacheClient(std::string_view ServiceUrl, + std::string_view Namespace, + std::string_view OAuthProvider, + std::string_view OAuthClientId, + std::string_view OAuthSecret) +: m_ServiceUrl(ServiceUrl) +, m_OAuthFullUri(OAuthProvider) +, m_Namespace(Namespace) +, m_DefaultBucket("default") +, m_OAuthClientId(OAuthClientId) +, m_OAuthSecret(OAuthSecret) +{ + if (!OAuthProvider.starts_with("http://"sv) && !OAuthProvider.starts_with("https://"sv)) + { + spdlog::warn("bad provider specification: '{}' - must be fully qualified"_format(OAuthProvider).c_str()); + m_IsValid = false; + + return; + } + + // Split into host and Uri substrings + + auto SchemePos = OAuthProvider.find("://"sv); + + if (SchemePos == std::string::npos) + { + spdlog::warn("Bad service URL passed to cloud cache client: '{}'", ServiceUrl); + m_IsValid = false; + + return; + } + + auto DomainEnd = OAuthProvider.find('/', /* also skip the :// */ SchemePos + 3); + + if (DomainEnd == std::string::npos) + { + spdlog::warn("Bad service URL passed to cloud cache client: '{}' no path delimiter found", ServiceUrl); + m_IsValid = false; + + return; + } + + m_OAuthDomain = OAuthProvider.substr(SchemePos + 3, DomainEnd - SchemePos - 3); // epicgames.okta.com + m_OAuthUriPath = OAuthProvider.substr(DomainEnd + 1); // oauth2/..../v1/token +} + +CloudCacheClient::~CloudCacheClient() +{ + RwLock::ExclusiveLockScope _(m_SessionStateLock); + + for (auto State : m_SessionStateCache) + { + delete State; + } +} + +bool +CloudCacheClient::AcquireAccessToken(std::string& 
AuthorizationHeaderValue) +{ + // TODO: check for expiration + + if (!m_IsValid) + { + ExtendableStringBuilder<128> OAuthFormData; + OAuthFormData << "client_id=" << m_OAuthClientId + << "&scope=cache_access&grant_type=client_credentials&client_secret=" << m_OAuthSecret; + + const uint32_t CurrentSerial = m_AccessToken.GetSerial(); + + static RwLock AuthMutex; + RwLock::ExclusiveLockScope _(AuthMutex); + + // Protect against redundant authentication operations + if (m_AccessToken.GetSerial() != CurrentSerial) + { + // TODO: this could verify that the token is actually valid and retry if not? + + return true; + } + + std::string data{OAuthFormData}; + + cpr::Response Response = + cpr::Post(cpr::Url{m_OAuthFullUri}, cpr::Header{{"Content-Type", "application/x-www-form-urlencoded"}}, cpr::Body{data}); + + std::string Body{std::move(Response.text)}; + + // Parse JSON response + + std::string JsonError; + json11::Json JsonResponse = json11::Json::parse(Body, /* out */ JsonError); + if (!JsonError.empty()) + { + spdlog::warn("failed to parse OAuth response: '{}'", JsonError); + + return false; + } + + std::string AccessToken = JsonResponse["access_token"].string_value(); + int ExpiryTimeSeconds = JsonResponse["expires_in"].int_value(); + + m_AccessToken.SetToken(AccessToken); + + m_IsValid = true; + } + + AuthorizationHeaderValue = m_AccessToken.GetAuthorizationHeaderValue(); + + return true; +} + +detail::CloudCacheSessionState* +CloudCacheClient::AllocSessionState() +{ + detail::CloudCacheSessionState* State = nullptr; + + if (RwLock::ExclusiveLockScope _(m_SessionStateLock); !m_SessionStateCache.empty()) + { + State = m_SessionStateCache.front(); + m_SessionStateCache.pop_front(); + } + + if (State == nullptr) + { + State = new detail::CloudCacheSessionState(*this); + } + + State->Reset(); + + return State; +} + +void +CloudCacheClient::FreeSessionState(detail::CloudCacheSessionState* State) +{ + RwLock::ExclusiveLockScope _(m_SessionStateLock); + 
m_SessionStateCache.push_front(State); +} + +} // namespace zen diff --git a/zenserver/upstream/jupiter.h b/zenserver/upstream/jupiter.h new file mode 100644 index 000000000..dd01cfb86 --- /dev/null +++ b/zenserver/upstream/jupiter.h @@ -0,0 +1,97 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/refcount.h> +#include <zencore/thread.h> + +#include <atomic> +#include <list> +#include <memory> + +namespace zen { +namespace detail { + struct CloudCacheSessionState; +} + +class IoBuffer; +class CloudCacheClient; +struct IoHash; + +/** + * Cached access token, for use with `Authorization:` header + */ +struct CloudCacheAccessToken +{ + std::string GetAuthorizationHeaderValue(); + void SetToken(std::string_view Token); + + inline uint32_t GetSerial() const { return m_Serial.load(std::memory_order::memory_order_relaxed); } + +private: + RwLock m_Lock; + std::string m_Token; + std::atomic<uint32_t> m_Serial; +}; + +/** + * Context for performing Jupiter operations + * + * Maintains an HTTP connection so that subsequent operations don't need to go + * through the whole connection setup process + * + */ +class CloudCacheSession +{ +public: + CloudCacheSession(CloudCacheClient* OuterClient); + ~CloudCacheSession(); + + IoBuffer Get(std::string_view BucketId, std::string_view Key); + void Put(std::string_view BucketId, std::string_view Key, IoBuffer Data); + +private: + RefPtr<CloudCacheClient> m_CacheClient; + detail::CloudCacheSessionState* m_SessionState; +}; + +/** + * Jupiter upstream cache client + */ +class CloudCacheClient : public RefCounted +{ +public: + CloudCacheClient(std::string_view ServiceUrl, + std::string_view Namespace, + std::string_view OAuthProvider, + std::string_view OAuthClientId, + std::string_view OAuthSecret); + ~CloudCacheClient(); + + bool AcquireAccessToken(std::string& AuthorizationHeaderValue); + std::string_view Namespace() const { return m_Namespace; } + std::string_view DefaultBucket() const { 
return m_DefaultBucket; } + std::string_view ServiceUrl() const { return m_ServiceUrl; } + +private: + bool m_IsValid = false; + std::string m_ServiceUrl; + std::string m_OAuthDomain; + std::string m_OAuthUriPath; + std::string m_OAuthFullUri; + std::string m_Namespace; + std::string m_DefaultBucket; + std::string m_OAuthClientId; + std::string m_OAuthSecret; + CloudCacheAccessToken m_AccessToken; + + RwLock m_SessionStateLock; + std::list<detail::CloudCacheSessionState*> m_SessionStateCache; + + detail::CloudCacheSessionState* AllocSessionState(); + void FreeSessionState(detail::CloudCacheSessionState*); + + friend class CloudCacheSession; +}; + +} // namespace zen diff --git a/zenserver/upstream/zen.cpp b/zenserver/upstream/zen.cpp new file mode 100644 index 000000000..7148715f2 --- /dev/null +++ b/zenserver/upstream/zen.cpp @@ -0,0 +1,291 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include "zen.h" + +#include <zencore/compactbinarybuilder.h> +#include <zencore/compactbinaryvalidation.h> +#include <zencore/fmtutils.h> +#include <zencore/stream.h> + +#include <spdlog/spdlog.h> +#include <xxhash.h> +#include <gsl/gsl-lite.hpp> + +namespace zen { + +namespace detail { + struct MessageHeader + { + static const uint32_t kMagic = 0x11'99'77'22; + + uint32_t Magic = kMagic; + uint32_t Checksum = 0; + uint16_t MessageSize = 0; // Size *including* this field and the reserved field + uint16_t Reserved = 0; + + void SetPayload(const void* PayloadData, uint64_t PayloadSize) + { + memcpy(Payload(), PayloadData, PayloadSize); + MessageSize = gsl::narrow<uint16_t>(PayloadSize + sizeof MessageSize + sizeof Reserved); + Checksum = ComputeChecksum(); + } + + inline CbObject GetMessage() const + { + if (IsOk()) + { + MemoryView MessageView(Payload(), MessageSize - sizeof MessageSize - sizeof Reserved); + + CbValidateError ValidationResult = ValidateCompactBinary(MessageView, CbValidateMode::All); + + if (ValidationResult == CbValidateError::None) + { + return 
CbObject{SharedBuffer::MakeView(MessageView)}; + } + } + + return {}; + } + + uint32_t TotalSize() const { return MessageSize + sizeof Checksum + sizeof Magic; } + uint32_t ComputeChecksum() const { return gsl::narrow_cast<uint32_t>(XXH3_64bits(&MessageSize, MessageSize)); } + inline bool IsOk() const { return Magic == kMagic && Checksum == ComputeChecksum(); } + + private: + inline void* Payload() { return &Reserved + 1; } + inline const void* Payload() const { return &Reserved + 1; } + }; +} // namespace detail + +// Note that currently this just implements an UDP echo service for testing purposes + +Mesh::Mesh(asio::io_context& IoContext) : m_IoContext(IoContext) +{ +} + +Mesh::~Mesh() +{ + Stop(); +} + +void +Mesh::Start(uint16_t Port) +{ + ZEN_ASSERT(Port); + ZEN_ASSERT(m_Port == 0); + + m_Port = Port; + m_UdpSocket = std::make_unique<asio::ip::udp::socket>(m_IoContext, asio::ip::udp::endpoint(asio::ip::udp::v4(), m_Port)); + m_Thread = std::make_unique<std::thread>([this] { Run(); }); +}; + +void +Mesh::Stop() +{ + using namespace std::literals; + + if (!m_Port) + { + // Never started, nothing to do here + return; + } + + CbObjectWriter Msg; + Msg << "bye"sv << m_SessionId; + BroadcastPacket(Msg); + + m_State = kExiting; + + std::error_code Ec; + m_Timer.cancel(Ec); + + m_UdpSocket->close(Ec); + + m_IoContext.stop(); + + if (m_Thread) + { + m_Thread->join(); + m_Thread.reset(); + } +} + +void +Mesh::EnqueueTick() +{ + m_Timer.expires_after(std::chrono::seconds(10)); + + m_Timer.async_wait([&](const std::error_code& Ec) { + if (!Ec) + { + OnTick(); + } + else + { + if (m_State != kExiting) + { + spdlog::warn("Mesh timer error: {}", Ec.message()); + } + } + }); +} + +void +Mesh::OnTick() +{ + using namespace std::literals; + + CbObjectWriter Msg; + + // Basic service information + + Msg.BeginArray("s"); + Msg << m_SessionId << m_Port << /* event sequence # */ uint32_t(0); + Msg.EndArray(); + + BroadcastPacket(Msg); + + EnqueueTick(); +} + +void 
+Mesh::BroadcastPacket(CbObjectWriter& Obj) +{ + std::error_code ErrorCode; + + asio::ip::udp::socket BroadcastSocket(m_IoContext); + BroadcastSocket.open(asio::ip::udp::v4(), ErrorCode); + + if (!ErrorCode) + { + BroadcastSocket.set_option(asio::ip::udp::socket::reuse_address(true)); + BroadcastSocket.set_option(asio::socket_base::broadcast(true)); + + asio::ip::udp::endpoint BroadcastEndpoint(asio::ip::address_v4::broadcast(), m_Port); + + uint8_t MessageBuffer[kMaxMessageSize]; + detail::MessageHeader* Message = reinterpret_cast<detail::MessageHeader*>(MessageBuffer); + *Message = {}; + + MemoryOutStream MemOut; + BinaryWriter Writer(MemOut); + + Obj.Save(Writer); + + // TODO: check that it fits in a packet! + + Message->SetPayload(MemOut.Data(), MemOut.Size()); + + BroadcastSocket.send_to(asio::buffer(Message, Message->TotalSize()), BroadcastEndpoint); + BroadcastSocket.close(); + } + else + { + spdlog::warn("failed to open broadcast socket: {}", ErrorCode.message()); + } +} + +void +Mesh::Run() +{ + m_State = kRunning; + + EnqueueTick(); + + IssueReceive(); + m_IoContext.run(); +} + +void +Mesh::IssueReceive() +{ + using namespace std::literals; + + m_UdpSocket->async_receive_from( + asio::buffer(m_MessageBuffer, sizeof m_MessageBuffer), + m_SenderEndpoint, + [this](std::error_code ec, size_t BytesReceived) { + if (!ec && BytesReceived) + { + std::error_code ErrorCode; + std::string Sender = m_SenderEndpoint.address().to_string(ErrorCode); + + // Process message + + uint32_t& Magic = *reinterpret_cast<uint32_t*>(m_MessageBuffer); + + switch (Magic) + { + case detail::MessageHeader::kMagic: + { + detail::MessageHeader& Header = *reinterpret_cast<detail::MessageHeader*>(m_MessageBuffer); + + if (CbObject Msg = Header.GetMessage()) + { + const asio::ip::address& Ip = m_SenderEndpoint.address(); + + if (auto Field = Msg["s"sv]) + { + // Announce + + CbArrayView Ci = Field.AsArrayView(); + auto It = Ci.CreateViewIterator(); + + const Oid SessionId = 
It->AsObjectId(); + + if (SessionId != Oid::Zero && SessionId != m_SessionId) + { + const uint16_t Port = (++It)->AsUInt16(m_SenderEndpoint.port()); + const uint32_t Lsn = (++It)->AsUInt32(); + + spdlog::info("received hey from {} ({})", Sender, SessionId); + + RwLock::ExclusiveLockScope _(m_SessionsLock); + + PeerInfo& Info = m_KnownPeers[SessionId]; + + Info.LastSeen = std::time(nullptr); + Info.SessionId = SessionId; + + if (std::find(begin(Info.SeenOnIP), end(Info.SeenOnIP), Ip) == Info.SeenOnIP.end()) + { + Info.SeenOnIP.push_back(Ip); + } + } + } + else if (auto Bye = Msg["bye"sv]) + { + Oid SessionId = Field.AsObjectId(); + + spdlog::info("received bye from {} ({})", Sender, SessionId); + + // We could verify that it's sent from a known IP before erasing the + // session, if we want to be paranoid + + RwLock::ExclusiveLockScope _(m_SessionsLock); + + m_KnownPeers.erase(SessionId); + } + else + { + // Unknown message type, just ignore + } + } + else + { + spdlog::warn("received malformed message from {}", Sender); + } + } + break; + + default: + spdlog::warn("received malformed data from {}", Sender); + break; + } + + IssueReceive(); + } + }); +} + +} // namespace zen diff --git a/zenserver/upstream/zen.h b/zenserver/upstream/zen.h new file mode 100644 index 000000000..75e29bf86 --- /dev/null +++ b/zenserver/upstream/zen.h @@ -0,0 +1,84 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#pragma once + +#include <zencore/memory.h> +#include <zencore/thread.h> +#include <zencore/uid.h> +#include <zencore/zencore.h> + +#pragma warning(push) +#pragma warning(disable : 4127) +#include <tsl/robin_map.h> +#pragma warning(pop) + +#include <asio.hpp> + +#include <chrono> + +namespace zen { + +class CbObjectWriter; + +/** Zen mesh tracker + * + * Discovers and tracks local peers + */ +class Mesh +{ +public: + Mesh(asio::io_context& IoContext); + ~Mesh(); + + void Start(uint16_t Port); + void Stop(); + +private: + void Run(); + void IssueReceive(); + void EnqueueTick(); + void OnTick(); + void BroadcastPacket(CbObjectWriter&); + + enum State + { + kInitializing, + kRunning, + kExiting + }; + + static const int kMaxMessageSize = 2048; + static const int kMaxUpdateSize = 1400; // We'll try not to send messages larger than this + + std::atomic<State> m_State = kInitializing; + asio::io_context& m_IoContext; + std::unique_ptr<asio::ip::udp::socket> m_UdpSocket; + std::unique_ptr<asio::ip::udp::socket> m_BroadcastSocket; + asio::ip::udp::endpoint m_SenderEndpoint; + std::unique_ptr<std::thread> m_Thread; + uint16_t m_Port = 0; + uint8_t m_MessageBuffer[kMaxMessageSize]; + asio::high_resolution_timer m_Timer{m_IoContext}; + Oid m_SessionId{Oid::NewOid()}; + + struct PeerInfo + { + Oid SessionId; + std::time_t LastSeen; + std::vector<asio::ip::address> SeenOnIP; + }; + + RwLock m_SessionsLock; + tsl::robin_map<Oid, PeerInfo, Oid::Hasher> m_KnownPeers; +}; + +class ZenKvCacheClient +{ +public: + ZenKvCacheClient(); + ~ZenKvCacheClient(); + +private: +}; + +} // namespace zen diff --git a/zenserver/vfs.cpp b/zenserver/vfs.cpp new file mode 100644 index 000000000..71f0bbdda --- /dev/null +++ b/zenserver/vfs.cpp @@ -0,0 +1,898 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include "vfs.h" + +#include <zencore/except.h> +#include <zencore/filesystem.h> +#include <zencore/snapshot_manifest.h> +#include <zencore/stream.h> +#include <zencore/windows.h> + +#include <map> + +#include <atlfile.h> +#include <projectedfslib.h> +#include <spdlog/spdlog.h> + +#pragma comment(lib, "projectedfslib.lib") + +namespace zen { + +////////////////////////////////////////////////////////////////////////// + +struct ProjFsCliOptions +{ + bool IsDebug = false; + bool IsClean = false; + std::string CasSpec; + std::string ManifestSpec; + std::string MountPoint; +}; + +struct GuidHasher +{ + size_t operator()(const GUID& Guid) const + { + static_assert(sizeof(GUID) == (sizeof(size_t) * 2)); + + const size_t* Ptr = reinterpret_cast<const size_t*>(&Guid); + + return Ptr[0] ^ Ptr[1]; + } +}; + +class ProjfsNamespace +{ +public: + HRESULT Initialize(const char* SnapshotSpec, const char* CasSpec) + { + std::filesystem::path ManifestSpec = zen::ManifestSpecToPath(SnapshotSpec); + + CAtlFile ManifestFile; + HRESULT hRes = ManifestFile.Create(ManifestSpec.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING); + if (FAILED(hRes)) + { + spdlog::error("MANIFEST NOT FOUND!"); // TODO: add context + + return hRes; + } + + ULONGLONG FileLength = 0; + ManifestFile.GetSize(FileLength); + + std::vector<uint8_t> Data; + Data.resize(FileLength); + + ManifestFile.Read(Data.data(), (DWORD)Data.size()); + + zen::MemoryInStream MemoryStream(Data.data(), Data.size()); + + ReadManifest(/* out */ m_Manifest, MemoryStream); + + uint64_t TotalBytes = 0; + uint64_t TotalFiles = 0; + + m_Manifest.Root.VisitFiles([&](const zen::LeafNode& Node) { + TotalBytes += Node.FileSize; + TotalFiles++; + }); + + m_FileByteCount = TotalBytes; + m_FileCount = TotalFiles; + + // CAS root + + zen::CasStoreConfiguration Config; + Config.RootDirectory = CasSpec; + m_CasStore->Initialize(Config); + + return S_OK; + } + + struct LookupResult + { + const zen::TreeNode* TreeNode = nullptr; + const 
zen::LeafNode* LeafNode = nullptr; + }; + + bool IsOnCasDrive(const char* Path) + { + ZEN_UNUSED(Path); + + // TODO: programmatically determine of CAS and workspace path is on same drive! + return true; + } + + LookupResult LookupNode(const std::wstring& Name) const + { + if (Name.empty()) + return {nullptr}; + + zen::ExtendableWideStringBuilder<MAX_PATH> LocalName; + LocalName.Append(Name.c_str()); + + // Split components + + const wchar_t* PathComponents[MAX_PATH / 2]; + size_t PathComponentCount = 0; + + const size_t Length = Name.length(); + + wchar_t* Base = LocalName.Data(); + wchar_t* itStart = Base; + + for (int i = 0; i < Length; ++i) + { + if (Base[i] == '\\') + { + // Component separator + + Base[i] = L'\0'; + + PathComponents[PathComponentCount++] = itStart; + + itStart = Base + i + 1; + } + } + + // Push final component + if (Name.back() != L'\\') + PathComponents[PathComponentCount++] = itStart; + + const zen::TreeNode* Node = &m_Manifest.Root; + + if (PathComponentCount == 1) + { + if (PrjFileNameCompare(L"root", Name.c_str()) == 0) + return {Node}; + else + return {nullptr}; + } + + for (size_t i = 1; i < PathComponentCount; ++i) + { + const auto& part = PathComponents[i]; + + const zen::TreeNode* NextNode = nullptr; + + for (const zen::TreeNode& ChildNode : Node->Children) + { + if (PrjFileNameCompare(part, ChildNode.Name.c_str()) == 0) + { + NextNode = &ChildNode; + break; + } + } + + if (NextNode) + { + Node = NextNode; + + continue; + } + + if (i == PathComponentCount - 1) + { + for (const zen::LeafNode& Leaf : Node->Leaves) + { + if (PrjFileNameCompare(part, Leaf.Name.c_str()) == 0) + return {nullptr, &Leaf}; + } + } + + return {nullptr}; + } + + return {Node}; + } + + const zen::SnapshotManifest& Manifest() const { return m_Manifest; } + zen::CasStore& CasStore() { return *m_CasStore; } + + uint64_t FileCount() const { return m_FileCount; } + uint64_t FileByteCount() const { return m_FileByteCount; } + +private: + zen::SnapshotManifest 
m_Manifest; + std::unique_ptr<zen::CasStore> m_CasStore; + + size_t m_FileCount = 0; + size_t m_FileByteCount = 0; +}; + +/** Projected File System Provider + */ + +class ProjfsProvider +{ +public: + HRESULT ReadManifest(const char* ManifestSpec, const char* CasSpec); + HRESULT Initialize(std::filesystem::path RootPath, bool Clean); + void Cleanup(); + + struct Callbacks; + +private: + static void DebugPrint(const char* Format, ...); + + HRESULT StartDirEnum(const PRJ_CALLBACK_DATA* CallbackData, LPCGUID EnumerationId); + HRESULT EndDirEnum(const PRJ_CALLBACK_DATA* CallbackData, LPCGUID EnumerationId); + HRESULT GetDirEnum(const PRJ_CALLBACK_DATA* CallbackData, + LPCGUID EnumerationId, + LPCWSTR SearchExpression, + PRJ_DIR_ENTRY_BUFFER_HANDLE DirEntryBufferHandle); + HRESULT GetPlaceholderInformation(const PRJ_CALLBACK_DATA* CallbackData); + HRESULT GetFileStream(const PRJ_CALLBACK_DATA* CallbackData, UINT64 ByteOffset, UINT32 Length); + HRESULT QueryFileName(const PRJ_CALLBACK_DATA* CallbackData); + HRESULT NotifyOperation(const PRJ_CALLBACK_DATA* CallbackData, + BOOLEAN IsDirectory, + PRJ_NOTIFICATION NotificationType, + LPCWSTR DestinationFileName, + PRJ_NOTIFICATION_PARAMETERS* OperationParameters); + void CancelCommand(const PRJ_CALLBACK_DATA* CallbackData); + + class DirectoryEnumeration; + + zen::RwLock m_Lock; + std::unordered_map<GUID, std::unique_ptr<DirectoryEnumeration>, GuidHasher> m_DirectoryEnumerators; + ProjfsNamespace m_Namespace; + PRJ_NAMESPACE_VIRTUALIZATION_CONTEXT m_PrjContext = nullptr; + bool m_GenerateFullFiles = false; +}; + +class ProjfsProvider::DirectoryEnumeration +{ +public: + DirectoryEnumeration(ProjfsProvider* Outer, LPCGUID EnumerationGuid, const wchar_t* RelativePath) + : m_Outer(Outer) + , m_EnumerationId(*EnumerationGuid) + , m_Path(RelativePath) + { + ResetScan(); + } + + ~DirectoryEnumeration() {} + + void ResetScan() + { + // Restart enumeration from beginning + + m_InfoIterator = m_Infos.end(); + + const 
ProjfsNamespace::LookupResult Lookup = m_Outer->m_Namespace.LookupNode(m_Path); + + if (Lookup.TreeNode == nullptr && Lookup.LeafNode == nullptr) + return; + + if (Lookup.TreeNode) + { + const zen::TreeNode* RootNode = Lookup.TreeNode; + + // Populate info array + + FILETIME FileTime; + GetSystemTimeAsFileTime(&FileTime); + + for (const zen::TreeNode& ChildNode : RootNode->Children) + { + PRJ_FILE_BASIC_INFO Fbi{0}; + + Fbi.IsDirectory = TRUE; + Fbi.FileSize = 0; + Fbi.CreationTime = Fbi.LastAccessTime = Fbi.LastWriteTime = Fbi.ChangeTime = *((LARGE_INTEGER*)&FileTime); + Fbi.FileAttributes = FILE_ATTRIBUTE_DIRECTORY; + + m_Infos.insert({ChildNode.Name, Fbi}); + } + + for (const zen::LeafNode& Leaf : RootNode->Leaves) + { + PRJ_FILE_BASIC_INFO Fbi{0}; + + Fbi.IsDirectory = FALSE; + Fbi.FileSize = Leaf.FileSize; + Fbi.FileAttributes = FILE_ATTRIBUTE_NORMAL; + Fbi.CreationTime = Fbi.LastAccessTime = Fbi.LastWriteTime = Fbi.ChangeTime = + *reinterpret_cast<const LARGE_INTEGER*>(&Leaf.FileModifiedTime); + + m_Infos.insert({Leaf.Name, Fbi}); + } + } + + m_InfoIterator = m_Infos.begin(); + } + + HRESULT HandleRequest(_In_ const PRJ_CALLBACK_DATA* CallbackData, + _In_opt_z_ LPCWSTR SearchExpression, + _In_ PRJ_DIR_ENTRY_BUFFER_HANDLE DirEntryBufferHandle) + { + int EnumLimit = INT_MAX; + + DebugPrint("ENUM '%S' -> pattern %S\n", CallbackData->FilePathName, SearchExpression); + + HRESULT hRes = S_OK; + + if (CallbackData->Flags & PRJ_CB_DATA_FLAG_ENUM_RESTART_SCAN) + ResetScan(); + + if (m_InfoIterator == m_Infos.end()) + return S_OK; + + if (CallbackData->Flags & PRJ_CB_DATA_FLAG_ENUM_RETURN_SINGLE_ENTRY) + EnumLimit = 1; + + if (!m_Predicate) + { + if (SearchExpression) + { + bool isWild = PrjDoesNameContainWildCards(SearchExpression); + + if (isWild) + { + if (SearchExpression[0] == L'*' && SearchExpression[1] == L'\0') + { + // Trivial accept -- no need to change predicate from the default + } + else + { + m_SearchExpression = SearchExpression; + + m_Predicate = 
[this](LPCWSTR name) { return PrjFileNameMatch(name, m_SearchExpression.c_str()); }; + } + } + else + { + if (SearchExpression[0]) + { + // Look for specific name match (does this ever happen?) + + m_SearchExpression = SearchExpression; + + m_Predicate = [this](LPCWSTR name) { return PrjFileNameCompare(name, m_SearchExpression.c_str()) == 0; }; + } + } + } + } + + if (!m_Predicate) + m_Predicate = [](LPCWSTR) { return true; }; + + while (EnumLimit && m_InfoIterator != m_Infos.end()) + { + auto& ThisNode = *m_InfoIterator; + + auto& Name = ThisNode.first; + auto& Info = ThisNode.second; + + if (m_Predicate(Name.c_str())) + { + hRes = PrjFillDirEntryBuffer(Name.c_str(), &Info, DirEntryBufferHandle); + + if (hRes == HRESULT_FROM_WIN32(ERROR_INSUFFICIENT_BUFFER)) + return S_OK; + + if (FAILED(hRes)) + break; + + --EnumLimit; + } + + ++m_InfoIterator; + } + + return hRes; + } + +private: + ProjfsProvider* m_Outer = nullptr; + const std::wstring m_Path; + const GUID m_EnumerationId; + + // We need to maintain an ordered list of directory items since the + // ProjFS enumeration code gets confused otherwise and ends up producing + // multiple entries for the same file if there's a 'hydrated' version + // present. 
+ + struct FilenameLess + { + bool operator()(const std::wstring& Lhs, const std::wstring& Rhs) const { return PrjFileNameCompare(Lhs.c_str(), Rhs.c_str()) < 0; } + }; + + typedef std::map<std::wstring, PRJ_FILE_BASIC_INFO, FilenameLess> FileInfoMap_t; + + FileInfoMap_t m_Infos; + FileInfoMap_t::iterator m_InfoIterator; + + std::wstring m_SearchExpression; + std::function<bool(LPCWSTR name)> m_Predicate; +}; + +////////////////////////////////////////////////////////////////////////// +// Callback forwarding functions +// + +struct ProjfsProvider::Callbacks +{ + static HRESULT CALLBACK StartDirEnum(_In_ const PRJ_CALLBACK_DATA* CallbackData, _In_ const GUID* EnumerationId) + { + return reinterpret_cast<ProjfsProvider*>(CallbackData->InstanceContext)->StartDirEnum(CallbackData, EnumerationId); + } + + static HRESULT CALLBACK EndDirEnum(_In_ const PRJ_CALLBACK_DATA* CallbackData, _In_ LPCGUID EnumerationId) + { + return reinterpret_cast<ProjfsProvider*>(CallbackData->InstanceContext)->EndDirEnum(CallbackData, EnumerationId); + } + + static HRESULT CALLBACK GetDirEnum(_In_ const PRJ_CALLBACK_DATA* CallbackData, + _In_ LPCGUID EnumerationId, + _In_opt_z_ LPCWSTR SearchExpression, + _In_ PRJ_DIR_ENTRY_BUFFER_HANDLE DirEntryBufferHandle) + { + return reinterpret_cast<ProjfsProvider*>(CallbackData->InstanceContext) + ->GetDirEnum(CallbackData, EnumerationId, SearchExpression, DirEntryBufferHandle); + } + + static HRESULT CALLBACK GetPlaceholderInformation(_In_ const PRJ_CALLBACK_DATA* CallbackData) + { + return reinterpret_cast<ProjfsProvider*>(CallbackData->InstanceContext)->GetPlaceholderInformation(CallbackData); + } + + static HRESULT CALLBACK GetFileStream(_In_ const PRJ_CALLBACK_DATA* CallbackData, _In_ UINT64 ByteOffset, _In_ UINT32 Length) + { + return reinterpret_cast<ProjfsProvider*>(CallbackData->InstanceContext)->GetFileStream(CallbackData, ByteOffset, Length); + } + + static HRESULT CALLBACK QueryFileName(_In_ const PRJ_CALLBACK_DATA* CallbackData) + { + 
return reinterpret_cast<ProjfsProvider*>(CallbackData->InstanceContext)->QueryFileName(CallbackData); + } + + static HRESULT CALLBACK NotifyOperation(_In_ const PRJ_CALLBACK_DATA* CallbackData, + _In_ BOOLEAN IsDirectory, + _In_ PRJ_NOTIFICATION NotificationType, + _In_opt_ LPCWSTR DestinationFileName, + _Inout_ PRJ_NOTIFICATION_PARAMETERS* OperationParameters) + { + return reinterpret_cast<ProjfsProvider*>(CallbackData->InstanceContext) + ->NotifyOperation(CallbackData, IsDirectory, NotificationType, DestinationFileName, OperationParameters); + } + + static VOID CALLBACK CancelCommand(_In_ const PRJ_CALLBACK_DATA* CallbackData) + { + return reinterpret_cast<ProjfsProvider*>(CallbackData->InstanceContext)->CancelCommand(CallbackData); + } +}; + +// {6EEB94E4-3EF3-4C1C-AF15-D7FF64C19A4F} +static const GUID ProviderGuid = {0x6eeb94e4, 0x3ef3, 0x4c1c, {0xaf, 0x15, 0xd7, 0xff, 0x64, 0xc1, 0x9a, 0x4f}}; + +void +ProjfsProvider::DebugPrint(const char* FmtString, ...) +{ + va_list vl; + va_start(vl, FmtString); + +#if 0 + vprintf(FmtString, vl); +#endif + + va_end(vl); +} + +HRESULT +ProjfsProvider::Initialize(std::filesystem::path RootPath, bool Clean) +{ + PRJ_PLACEHOLDER_VERSION_INFO Pvi = {}; + Pvi.ContentID[0] = 1; + + if (Clean && std::filesystem::exists(RootPath)) + { + printf("Cleaning '%S'...", RootPath.c_str()); + + bool success = zen::DeleteDirectories(RootPath); + + if (!success) + { + printf(" retrying..."); + + success = zen::DeleteDirectories(RootPath); + + // Failed? 
+ } + + printf(" done!\n"); + } + + bool RootDirectoryCreated = false; + +retry: + if (!std::filesystem::exists(RootPath)) + { + zen::CreateDirectories(RootPath); + } + + { + HRESULT hRes = PrjMarkDirectoryAsPlaceholder(RootPath.c_str(), nullptr, &Pvi, &ProviderGuid); + + if (FAILED(hRes)) + { + if (hRes == HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND) && !RootDirectoryCreated) + { + printf("Creating '%S'...", RootPath.c_str()); + + std::filesystem::create_directories(RootPath.c_str()); + + RootDirectoryCreated = true; + + printf("done!\n"); + + goto retry; + } + else if (hRes == HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND)) + { + throw zen::WindowsException(hRes, "Failed to initialize root placeholder"); + } + + // Ignore error, problems will be reported below anyway + } + } + + // Callbacks + + PRJ_CALLBACKS cbs = {}; + + cbs.StartDirectoryEnumerationCallback = Callbacks::StartDirEnum; + cbs.EndDirectoryEnumerationCallback = Callbacks::EndDirEnum; + cbs.GetDirectoryEnumerationCallback = Callbacks::GetDirEnum; + cbs.GetPlaceholderInfoCallback = Callbacks::GetPlaceholderInformation; + cbs.GetFileDataCallback = Callbacks::GetFileStream; + cbs.QueryFileNameCallback = Callbacks::QueryFileName; + cbs.NotificationCallback = Callbacks::NotifyOperation; + cbs.CancelCommandCallback = Callbacks::CancelCommand; + + // Parameters + + const PRJ_NOTIFY_TYPES dwNotifications = PRJ_NOTIFY_FILE_OPENED | PRJ_NOTIFY_NEW_FILE_CREATED | PRJ_NOTIFY_FILE_OVERWRITTEN | + PRJ_NOTIFY_PRE_DELETE | PRJ_NOTIFY_PRE_RENAME | PRJ_NOTIFY_PRE_SET_HARDLINK | + PRJ_NOTIFY_FILE_RENAMED | PRJ_NOTIFY_HARDLINK_CREATED | + PRJ_NOTIFY_FILE_HANDLE_CLOSED_NO_MODIFICATION | PRJ_NOTIFY_FILE_HANDLE_CLOSED_FILE_MODIFIED | + PRJ_NOTIFY_FILE_HANDLE_CLOSED_FILE_DELETED | PRJ_NOTIFY_FILE_PRE_CONVERT_TO_FULL; + + PRJ_NOTIFICATION_MAPPING Mappings[] = {{dwNotifications, L"root"}}; + + PRJ_STARTVIRTUALIZING_OPTIONS SvOptions = {}; + + SvOptions.Flags = PRJ_FLAG_NONE; + SvOptions.PoolThreadCount = 8; + 
SvOptions.ConcurrentThreadCount = 8; + SvOptions.NotificationMappings = Mappings; + SvOptions.NotificationMappingsCount = 1; + + HRESULT hRes = PrjStartVirtualizing(RootPath.c_str(), &cbs, this, &SvOptions, &m_PrjContext); + + if (SUCCEEDED(hRes)) + { + // Create dummy 'root' directory for now until I figure out how to + // invalidate entire trees (ProjFS won't allow invalidation of the + // entire provider tree). + + PRJ_PLACEHOLDER_INFO pli{}; + pli.FileBasicInfo.IsDirectory = TRUE; + pli.FileBasicInfo.FileAttributes = FILE_ATTRIBUTE_DIRECTORY; + pli.VersionInfo = Pvi; + + hRes = PrjWritePlaceholderInfo(m_PrjContext, L"root", &pli, sizeof pli); + } + + if (SUCCEEDED(hRes)) + { + spdlog::info("Successfully mounted snapshot at '{}'!", WideToUtf8(RootPath.c_str())); + } + else + { + spdlog::info("Failed mounting snapshot at '{}'!", WideToUtf8(RootPath.c_str())); + } + + return hRes; +} + +void +ProjfsProvider::Cleanup() +{ + PrjStopVirtualizing(m_PrjContext); +} + +HRESULT +ProjfsProvider::ReadManifest(const char* ManifestSpec, const char* CasSpec) +{ + printf("Initializing from manifest '%s'\n", ManifestSpec); + + m_Namespace.Initialize(ManifestSpec, CasSpec); + + return S_OK; +} + +HRESULT +ProjfsProvider::StartDirEnum(const PRJ_CALLBACK_DATA* CallbackData, LPCGUID EnumerationId) +{ + zen::RwLock::ExclusiveLockScope _(m_Lock); + + m_DirectoryEnumerators[*EnumerationId] = std::make_unique<DirectoryEnumeration>(this, EnumerationId, CallbackData->FilePathName); + + return S_OK; +} + +HRESULT +ProjfsProvider::EndDirEnum(const PRJ_CALLBACK_DATA* CallbackData, LPCGUID EnumerationId) +{ + ZEN_UNUSED(CallbackData); + ZEN_UNUSED(EnumerationId); + + zen::RwLock::ExclusiveLockScope _(m_Lock); + + m_DirectoryEnumerators.erase(*EnumerationId); + + return S_OK; +} + +HRESULT +ProjfsProvider::GetDirEnum(const PRJ_CALLBACK_DATA* CallbackData, + LPCGUID EnumerationId, + LPCWSTR SearchExpression, + PRJ_DIR_ENTRY_BUFFER_HANDLE DirEntryBufferHandle) +{ + DirectoryEnumeration* 
directoryEnumerator; + + { + zen::RwLock::SharedLockScope _(m_Lock); + + auto it = m_DirectoryEnumerators.find(*EnumerationId); + + if (it == m_DirectoryEnumerators.end()) + return E_FAIL; // No enumerator associated with specified GUID + + directoryEnumerator = (*it).second.get(); + } + + return directoryEnumerator->HandleRequest(CallbackData, SearchExpression, DirEntryBufferHandle); +} + +HRESULT +ProjfsProvider::GetPlaceholderInformation(const PRJ_CALLBACK_DATA* CallbackData) +{ + ProjfsNamespace::LookupResult result = m_Namespace.LookupNode(CallbackData->FilePathName); + + if (auto Leaf = result.LeafNode) + { + PRJ_PLACEHOLDER_INFO PlaceholderInfo = {}; + + LARGE_INTEGER FileTime; + FileTime.QuadPart = Leaf->FileModifiedTime; + + PlaceholderInfo.FileBasicInfo.ChangeTime = FileTime; + PlaceholderInfo.FileBasicInfo.CreationTime = FileTime; + PlaceholderInfo.FileBasicInfo.LastAccessTime = FileTime; + PlaceholderInfo.FileBasicInfo.LastWriteTime = FileTime; + PlaceholderInfo.FileBasicInfo.FileSize = Leaf->FileSize; + PlaceholderInfo.FileBasicInfo.IsDirectory = 0; + PlaceholderInfo.FileBasicInfo.FileAttributes = FILE_ATTRIBUTE_NORMAL; + + HRESULT hRes = PrjWritePlaceholderInfo(m_PrjContext, CallbackData->FilePathName, &PlaceholderInfo, sizeof PlaceholderInfo); + + return hRes; + } + + if (auto node = result.TreeNode) + { + PRJ_PLACEHOLDER_INFO PlaceholderInfo = {}; + + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + + LARGE_INTEGER FileTime; + FileTime.QuadPart = UINT64(ft.dwHighDateTime) << 32 | ft.dwLowDateTime; + + PlaceholderInfo.FileBasicInfo.ChangeTime = FileTime; + PlaceholderInfo.FileBasicInfo.CreationTime = FileTime; + PlaceholderInfo.FileBasicInfo.LastAccessTime = FileTime; + PlaceholderInfo.FileBasicInfo.LastWriteTime = FileTime; + PlaceholderInfo.FileBasicInfo.IsDirectory = TRUE; + PlaceholderInfo.FileBasicInfo.FileAttributes = FILE_ATTRIBUTE_DIRECTORY; + + HRESULT hRes = PrjWritePlaceholderInfo(m_PrjContext, CallbackData->FilePathName, &PlaceholderInfo, 
sizeof PlaceholderInfo); + + return hRes; + } + + return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); +} + +HRESULT +ProjfsProvider::GetFileStream(const PRJ_CALLBACK_DATA* CallbackData, UINT64 ByteOffset, UINT32 Length) +{ + ProjfsNamespace::LookupResult result = m_Namespace.LookupNode(CallbackData->FilePathName); + + if (const zen::LeafNode* leaf = result.LeafNode) + { + zen::CasStore& casStore = m_Namespace.CasStore(); + + const zen::IoHash& ChunkHash = leaf->ChunkHash; + + zen::IoBuffer Chunk = casStore.FindChunk(ChunkHash); + + if (!Chunk) + return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); + + if (m_GenerateFullFiles) + { + DWORD chunkSize = (DWORD)Chunk.Size(); + + zen::StringBuilder<66> b3string; + DebugPrint("GET FILE STREAM: %s -> %d '%S'\n", ChunkHash.ToHexString(b3string).c_str(), chunkSize, CallbackData->FilePathName); + + // TODO: implement support for chunks > 4GB + ZEN_ASSERT(chunkSize == Chunk.Size()); + + HRESULT hRes = PrjWriteFileData(m_PrjContext, &CallbackData->DataStreamId, (PVOID)Chunk.Data(), 0, chunkSize); + + return hRes; + } + else + { + HRESULT hRes = PrjWriteFileData(m_PrjContext, + &CallbackData->DataStreamId, + (PVOID)(reinterpret_cast<const uint8_t*>(Chunk.Data()) + ByteOffset), + ByteOffset, + Length); + + return hRes; + } + } + + return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); +} + +HRESULT +ProjfsProvider::QueryFileName(const PRJ_CALLBACK_DATA* CallbackData) +{ + ProjfsNamespace::LookupResult result = m_Namespace.LookupNode(CallbackData->FilePathName); + + if (result.LeafNode || result.TreeNode) + return S_OK; + + return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); +} + +HRESULT +ProjfsProvider::NotifyOperation(const PRJ_CALLBACK_DATA* CallbackData, + BOOLEAN IsDirectory, + PRJ_NOTIFICATION NotificationType, + LPCWSTR DestinationFileName, + PRJ_NOTIFICATION_PARAMETERS* OperationParameters) +{ + ZEN_UNUSED(DestinationFileName); + + switch (NotificationType) + { + case PRJ_NOTIFICATION_FILE_OPENED: + { + auto& pc = 
OperationParameters->PostCreate; + + DebugPrint("*** OPEN: %s %08x '%S'\n", IsDirectory ? "(DIR)" : "-FILE", pc.NotificationMask, CallbackData->FilePathName); + } + break; + + case PRJ_NOTIFICATION_NEW_FILE_CREATED: + { + auto& pc = OperationParameters->PostCreate; + + DebugPrint("*** NEW : %s %08x '%S'\n", IsDirectory ? "(DIR)" : "-FILE", pc.NotificationMask, CallbackData->FilePathName); + } + break; + + case PRJ_NOTIFICATION_FILE_OVERWRITTEN: + { + auto& pc = OperationParameters->PostCreate; + + DebugPrint("*** OVER: %s %08x '%S'\n", IsDirectory ? "(DIR)" : "-FILE", pc.NotificationMask, CallbackData->FilePathName); + } + break; + + case PRJ_NOTIFICATION_PRE_DELETE: + { + if (wcsstr(CallbackData->FilePathName, L"en-us")) + DebugPrint("*** PRE DELETE '%S'\n", CallbackData->FilePathName); + + DebugPrint("*** PRE DELETE '%S'\n", CallbackData->FilePathName); + } + break; + + case PRJ_NOTIFICATION_PRE_RENAME: + DebugPrint("*** PRE RENAME '%S'\n", CallbackData->FilePathName); + break; + + case PRJ_NOTIFICATION_PRE_SET_HARDLINK: + DebugPrint("*** PRE SET HARDLINK '%S'\n", CallbackData->FilePathName); + break; + + case PRJ_NOTIFICATION_FILE_RENAMED: + DebugPrint("*** FILE RENAMED '%S'\n", CallbackData->FilePathName); + break; + + case PRJ_NOTIFICATION_HARDLINK_CREATED: + DebugPrint("*** HARDLINK RENAMED '%S'\n", CallbackData->FilePathName); + break; + + case PRJ_NOTIFICATION_FILE_HANDLE_CLOSED_NO_MODIFICATION: + DebugPrint("*** FILE CLOSED NO CHANGE '%S'\n", CallbackData->FilePathName); + break; + + case PRJ_NOTIFICATION_FILE_HANDLE_CLOSED_FILE_MODIFIED: + { + // const auto& handleClose = OperationParameters->FileDeletedOnHandleClose; + + DebugPrint("*** FILE CLOSED MODIFIED '%S'\n", CallbackData->FilePathName); + } + break; + + case PRJ_NOTIFICATION_FILE_HANDLE_CLOSED_FILE_DELETED: + { + // const auto& handleClose = OperationParameters->FileDeletedOnHandleClose; + + DebugPrint("*** FILE CLOSED DELETED '%S'\n", CallbackData->FilePathName); + } + break; + + case 
PRJ_NOTIFICATION_FILE_PRE_CONVERT_TO_FULL: + DebugPrint("*** FILE PRE CONVERT FULL '%S'\n", CallbackData->FilePathName); + break; + } + + return S_OK; +} + +void +ProjfsProvider::CancelCommand(const PRJ_CALLBACK_DATA* CallbackData) +{ + ZEN_UNUSED(CallbackData); +} + +////////////////////////////////////////////////////////////////////////// + +struct Vfs::VfsImpl +{ + void Initialize() { m_PrjProvider.Initialize("E:\\VFS_Test", /* clean */ true); } + void Start() {} + void Stop() {} + +private: + ProjfsProvider m_PrjProvider; +}; + +////////////////////////////////////////////////////////////////////////// + +Vfs::Vfs() : m_Impl(new VfsImpl) +{ +} + +Vfs::~Vfs() +{ +} + +void +Vfs::Initialize() +{ + m_Impl->Initialize(); +} + +void +Vfs::Start() +{ +} + +void +Vfs::Stop() +{ +} + +} // namespace zen diff --git a/zenserver/vfs.h b/zenserver/vfs.h new file mode 100644 index 000000000..e77ff381b --- /dev/null +++ b/zenserver/vfs.h @@ -0,0 +1,31 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/httpserver.h> +#include <zenstore/CAS.h> + +namespace zen { + +/** + * Virtual File System serving + */ + +class Vfs +{ +public: + Vfs(); + ~Vfs(); + + void Initialize(); + + void Start(); + void Stop(); + +private: + struct VfsImpl; + + std::unique_ptr<VfsImpl> m_Impl; +}; + +} // namespace zen diff --git a/zenserver/zenserver.cpp b/zenserver/zenserver.cpp new file mode 100644 index 000000000..934fd95bc --- /dev/null +++ b/zenserver/zenserver.cpp @@ -0,0 +1,278 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <zencore/filesystem.h> +#include <zencore/fmtutils.h> +#include <zencore/httpserver.h> +#include <zencore/iobuffer.h> +#include <zencore/refcount.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/timer.h> +#include <zencore/windows.h> +#include <zenstore/cas.h> + +#include <fmt/format.h> +#include <mimalloc-new-delete.h> +#include <mimalloc.h> +#include <spdlog/spdlog.h> +#include <asio.hpp> +#include <list> +#include <lua.hpp> +#include <optional> +#include <regex> +#include <unordered_map> + +////////////////////////////////////////////////////////////////////////// +// We don't have any doctest code in this file but this is needed to bring +// in some shared code into the executable + +#define DOCTEST_CONFIG_IMPLEMENT +#include <doctest/doctest.h> +#undef DOCTEST_CONFIG_IMPLEMENT + +////////////////////////////////////////////////////////////////////////// + +#include "casstore.h" +#include "config.h" +#include "diag/crashreport.h" +#include "diag/logging.h" + +////////////////////////////////////////////////////////////////////////// +// Services +// + +#include "admin/admin.h" +#include "cache/kvcache.h" +#include "cache/structuredcache.h" +#include "diag/diagsvcs.h" +#include "experimental/usnjournal.h" +#include "projectstore.h" +#include "testing/launch.h" +#include "upstream/jupiter.h" +#include "upstream/zen.h" +#include "zenstore/gc.h" +#include "zenstore/scrub.h" + +#define ZEN_APP_NAME "Zen store" + +class ZenServer +{ +public: + void Initialize(int BasePort, int ParentPid) + { + using namespace fmt::literals; + spdlog::info(ZEN_APP_NAME " initializing"); + + if (ParentPid) + { + m_Process.Initialize(ParentPid); + } + + // Prototype config system, let's see how this pans out + + ZenServiceConfig ServiceConfig; + ParseServiceConfig(m_DataRoot, /* out */ ServiceConfig); + + // Ok so now we're configured, let's kick things off + + zen::CasStoreConfiguration Config; + Config.RootDirectory = m_DataRoot / "CAS"; + + 
m_CasStore->Initialize(Config); + + spdlog::info("instantiating project service"); + + m_ProjectStore = new zen::ProjectStore(*m_CasStore, m_DataRoot / "Builds"); + m_HttpProjectService.reset(new zen::HttpProjectService{*m_CasStore, m_ProjectStore}); + m_LocalProjectService = zen::LocalProjectService::New(*m_CasStore, m_ProjectStore); + + m_HttpLaunchService = std::make_unique<zen::HttpLaunchService>(*m_CasStore); + + if (ServiceConfig.LegacyCacheEnabled) + { + spdlog::info("instantiating legacy cache service"); + m_CacheService.reset(new zen::HttpKvCacheService()); + } + else + { + spdlog::info("NOT instantiating legacy cache service"); + } + + if (ServiceConfig.StructuredCacheEnabled) + { + spdlog::info("instantiating structured cache service"); + m_StructuredCacheService.reset(new zen::HttpStructuredCacheService(m_DataRoot / "cache", *m_CasStore)); + } + else + { + spdlog::info("NOT instantiating structured cache service"); + } + + m_Http.Initialize(BasePort); + m_Http.AddEndpoint(m_HealthService); + m_Http.AddEndpoint(m_TestService); + m_Http.AddEndpoint(m_AdminService); + + if (m_HttpProjectService) + { + m_Http.AddEndpoint(*m_HttpProjectService); + } + + m_Http.AddEndpoint(m_CasService); + + if (m_CacheService) + { + spdlog::info("instantiating legacy cache service"); + m_Http.AddEndpoint(*m_CacheService); + } + + if (m_StructuredCacheService) + { + m_Http.AddEndpoint(*m_StructuredCacheService); + } + + if (m_HttpLaunchService) + { + m_Http.AddEndpoint(*m_HttpLaunchService); + } + + // Experimental + // + // m_ZenMesh.Start(1337); + } + + void Run() + { + if (m_Process.IsValid()) + { + EnqueueTimer(); + } + + if (!m_TestMode) + { + spdlog::info("__________ _________ __ "); + spdlog::info("\\____ /____ ____ / _____// |_ ___________ ____ "); + spdlog::info(" / // __ \\ / \\ \\_____ \\\\ __\\/ _ \\_ __ \\_/ __ \\ "); + spdlog::info(" / /\\ ___/| | \\ / \\| | ( <_> ) | \\/\\ ___/ "); + spdlog::info("/_______ \\___ >___| / /_______ /|__| \\____/|__| \\___ >"); + 
spdlog::info(" \\/ \\/ \\/ \\/ \\/ "); + } + + spdlog::info(ZEN_APP_NAME " now running"); + + m_Http.Run(m_TestMode); + + spdlog::info(ZEN_APP_NAME " exiting"); + + m_IoContext.stop(); + } + + void RequestExit(int ExitCode) + { + RequestApplicationExit(ExitCode); + m_Http.RequestExit(); + } + + void Cleanup() { spdlog::info(ZEN_APP_NAME " cleaning up"); } + + void SetTestMode(bool State) { m_TestMode = State; } + void SetDataRoot(std::filesystem::path Root) { m_DataRoot = Root; } + + void EnqueueTimer() + { + m_PidCheckTimer.expires_after(std::chrono::seconds(1)); + m_PidCheckTimer.async_wait([this](const asio::error_code&) { CheckOwnerPid(); }); + } + + void CheckOwnerPid() + { + if (m_Process.IsRunning()) + { + EnqueueTimer(); + } + else + { + spdlog::info(ZEN_APP_NAME " exiting since parent process id {} is gone", m_Process.Pid()); + + RequestExit(0); + } + } + +private: + bool m_TestMode = false; + std::filesystem::path m_DataRoot; + asio::io_context m_IoContext; + asio::steady_timer m_PidCheckTimer{m_IoContext}; + zen::Process m_Process; + + zen::HttpServer m_Http; + std::unique_ptr<zen::CasStore> m_CasStore{zen::CreateCasStore()}; + zen::CasGc m_Gc{*m_CasStore}; + zen::CasScrubber m_Scrubber{*m_CasStore}; + HttpTestService m_TestService; + zen::HttpCasService m_CasService{*m_CasStore}; + std::unique_ptr<zen::HttpKvCacheService> m_CacheService; + zen::RefPtr<zen::ProjectStore> m_ProjectStore; + zen::Ref<zen::LocalProjectService> m_LocalProjectService; + std::unique_ptr<zen::HttpLaunchService> m_HttpLaunchService; + std::unique_ptr<zen::HttpProjectService> m_HttpProjectService; + std::unique_ptr<zen::HttpStructuredCacheService> m_StructuredCacheService; + HttpAdminService m_AdminService; + HttpHealthService m_HealthService; + zen::Mesh m_ZenMesh{m_IoContext}; +}; + +int +main(int argc, char* argv[]) +{ + mi_version(); + + ZenServerOptions GlobalOptions; + ParseGlobalCliOptions(argc, argv, GlobalOptions); + InitializeCrashReporting(GlobalOptions.DataDir / 
"crashdumps"); + InitializeLogging(GlobalOptions); + + spdlog::info("zen cache server starting on port {}", GlobalOptions.BasePort); + + try + { + std::unique_ptr<std::thread> ShutdownThread; + std::unique_ptr<zen::NamedEvent> ShutdownEvent; + + ZenServer Cache; + Cache.SetDataRoot(GlobalOptions.DataDir); + Cache.SetTestMode(GlobalOptions.IsTest); + Cache.Initialize(GlobalOptions.BasePort, GlobalOptions.OwnerPid); + + if (!GlobalOptions.ChildId.empty()) + { + zen::ExtendableStringBuilder<64> ShutdownEventName; + ShutdownEventName << GlobalOptions.ChildId << "_Shutdown"; + ShutdownEvent.reset(new zen::NamedEvent{ShutdownEventName}); + + zen::NamedEvent ParentEvent{GlobalOptions.ChildId}; + ParentEvent.Set(); + + ShutdownThread.reset(new std::thread{[&] { + ShutdownEvent->Wait(); + spdlog::info("shutdown signal received"); + Cache.RequestExit(0); + }}); + } + + Cache.Run(); + Cache.Cleanup(); + + if (ShutdownEvent) + { + ShutdownEvent->Set(); + ShutdownThread->join(); + } + } + catch (std::exception& e) + { + SPDLOG_CRITICAL("Caught exception in main: {}", e.what()); + } + + return 0; +} diff --git a/zenserver/zenserver.vcxproj b/zenserver/zenserver.vcxproj new file mode 100644 index 000000000..b47ec2f04 --- /dev/null +++ b/zenserver/zenserver.vcxproj @@ -0,0 +1,150 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>15.0</VCProjectVersion> + <ProjectGuid>{8398D81C-B1B6-4327-82B1-06EACB8A144F}</ProjectGuid> + <Keyword>Win32Proj</Keyword> + <RootNamespace>zenserver</RootNamespace> 
+ <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_debug.props" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_release.props" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <LinkIncremental>false</LinkIncremental> + 
<IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);..\3rdparty\TraceLog\Public</IncludePath> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <LinkIncremental>true</LinkIncremental> + <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);..\3rdparty\TraceLog\Public</IncludePath> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <PrecompiledHeader>NotUsing</PrecompiledHeader> + <Optimization>MaxSpeed</Optimization> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>..\zencore\include;..\zenstore\include;.</AdditionalIncludeDirectories> + <LanguageStandard>stdcpplatest</LanguageStandard> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + <UACExecutionLevel>RequireAdministrator</UACExecutionLevel> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <PrecompiledHeader>NotUsing</PrecompiledHeader> + <Optimization>Disabled</Optimization> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + 
<ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>..\zencore\include;..\zenstore\include;.</AdditionalIncludeDirectories> + <LanguageStandard>stdcpplatest</LanguageStandard> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + <UACExecutionLevel>RequireAdministrator</UACExecutionLevel> + <DelayLoadDLLs>projectedfslib.dll</DelayLoadDLLs> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClInclude Include="admin\admin.h" /> + <ClInclude Include="cache\structuredcache.h" /> + <ClInclude Include="config.h" /> + <ClInclude Include="diag\crashreport.h" /> + <ClInclude Include="diag\logging.h" /> + <ClInclude Include="upstream\jupiter.h" /> + <ClInclude Include="projectstore.h" /> + <ClInclude Include="cache\cacheagent.h" /> + <ClInclude Include="cache\cachestore.h" /> + <ClInclude Include="cache\kvcache.h" /> + <ClInclude Include="testing\launch.h" /> + <ClInclude Include="casstore.h" /> + <ClInclude Include="diag\diagsvcs.h" /> + <ClInclude Include="experimental\usnjournal.h" /> + <ClInclude Include="targetver.h" /> + <ClInclude Include="upstream\zen.h" /> + <ClInclude Include="vfs.h" /> + </ItemGroup> + <ItemGroup> + <ClCompile Include="cache\kvcache.cpp" /> + <ClCompile Include="cache\structuredcache.cpp" /> + <ClCompile Include="config.cpp" /> + <ClCompile Include="diag\crashreport.cpp" /> + <ClCompile Include="diag\logging.cpp" /> + <ClCompile Include="projectstore.cpp" /> + <ClCompile Include="cache\cacheagent.cpp" /> + <ClCompile Include="upstream\jupiter.cpp" /> + <ClCompile Include="testing\launch.cpp" /> + <ClCompile Include="cache\cachestore.cpp" /> + <ClCompile Include="casstore.cpp" /> + <ClCompile Include="experimental\usnjournal.cpp" /> + <ClCompile Include="upstream\zen.cpp" /> + <ClCompile Include="vfs.cpp" /> + <ClCompile Include="zenserver.cpp" /> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\zencore\zencore.vcxproj"> + 
<Project>{d75bf9ab-c61e-4fff-ad59-1563430f05e2}</Project> + </ProjectReference> + <ProjectReference Include="..\zenstore\zenstore.vcxproj"> + <Project>{26cbbaeb-14c1-4efc-877d-80f48215651c}</Project> + </ProjectReference> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/zenserver/zenserver.vcxproj.filters b/zenserver/zenserver.vcxproj.filters new file mode 100644 index 000000000..fcf869e19 --- /dev/null +++ b/zenserver/zenserver.vcxproj.filters @@ -0,0 +1,88 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClInclude Include="targetver.h" /> + <ClInclude Include="projectstore.h" /> + <ClInclude Include="casstore.h" /> + <ClInclude Include="vfs.h" /> + <ClInclude Include="testing\launch.h" /> + <ClInclude Include="cache\cacheagent.h"> + <Filter>cache</Filter> + </ClInclude> + <ClInclude Include="cache\cachestore.h"> + <Filter>cache</Filter> + </ClInclude> + <ClInclude Include="cache\kvcache.h"> + <Filter>cache</Filter> + </ClInclude> + <ClInclude Include="diag\diagsvcs.h"> + <Filter>diag</Filter> + </ClInclude> + <ClInclude Include="admin\admin.h"> + <Filter>admin</Filter> + </ClInclude> + <ClInclude Include="experimental\usnjournal.h"> + <Filter>experimental</Filter> + </ClInclude> + <ClInclude Include="upstream\jupiter.h"> + <Filter>upstream</Filter> + </ClInclude> + <ClInclude Include="upstream\zen.h"> + <Filter>upstream</Filter> + </ClInclude> + <ClInclude Include="cache\structuredcache.h"> + <Filter>cache</Filter> + </ClInclude> + <ClInclude Include="config.h" /> + <ClInclude Include="diag\logging.h" /> + <ClInclude Include="diag\crashreport.h" /> + </ItemGroup> + <ItemGroup> + <ClCompile Include="zenserver.cpp" /> + <ClCompile Include="projectstore.cpp" /> + <ClCompile Include="casstore.cpp" /> + <ClCompile Include="vfs.cpp" /> + <ClCompile Include="cache\cacheagent.cpp"> + <Filter>cache</Filter> + </ClCompile> + <ClCompile Include="cache\cachestore.cpp"> + <Filter>cache</Filter> + </ClCompile> + <ClCompile Include="experimental\usnjournal.cpp"> + <Filter>experimental</Filter> + </ClCompile> + <ClCompile Include="testing\launch.cpp" /> + <ClCompile Include="upstream\jupiter.cpp"> + 
<Filter>upstream</Filter> + </ClCompile> + <ClCompile Include="upstream\zen.cpp"> + <Filter>upstream</Filter> + </ClCompile> + <ClCompile Include="cache\structuredcache.cpp"> + <Filter>cache</Filter> + </ClCompile> + <ClCompile Include="cache\kvcache.cpp"> + <Filter>cache</Filter> + </ClCompile> + <ClCompile Include="config.cpp" /> + <ClCompile Include="diag\logging.cpp" /> + <ClCompile Include="diag\crashreport.cpp" /> + </ItemGroup> + <ItemGroup> + <Filter Include="cache"> + <UniqueIdentifier>{98e47c47-6bbe-46f5-b7cd-4b54352d964e}</UniqueIdentifier> + </Filter> + <Filter Include="diag"> + <UniqueIdentifier>{6a09a36e-fb5f-452a-ba0c-6d029240bad0}</UniqueIdentifier> + </Filter> + <Filter Include="admin"> + <UniqueIdentifier>{f72f861e-fa14-4ff8-9338-f0f84f4a8389}</UniqueIdentifier> + </Filter> + <Filter Include="experimental"> + <UniqueIdentifier>{76916270-97a6-4ec8-b323-a95b6080e245}</UniqueIdentifier> + </Filter> + <Filter Include="upstream"> + <UniqueIdentifier>{303c28c2-3607-4ef4-89bd-e3618fe37e74}</UniqueIdentifier> + </Filter> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/zenstore/CAS.cpp b/zenstore/CAS.cpp new file mode 100644 index 000000000..8d81fc5cb --- /dev/null +++ b/zenstore/CAS.cpp @@ -0,0 +1,192 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zenstore/cas.h> + +#include "compactcas.h" +#include "filecas.h" + +#include <doctest/doctest.h> +#include <zencore/except.h> +#include <zencore/fmtutils.h> +#include <zencore/memory.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/uid.h> + +#include <spdlog/spdlog.h> + +#include <gsl/gsl-lite.hpp> + +#include <filesystem> +#include <functional> +#include <unordered_map> + +struct IUnknown; // Workaround for "combaseapi.h(229): error C2187: syntax error: 'identifier' was unexpected here" when using /permissive- +#include <atlfile.h> + +////////////////////////////////////////////////////////////////////////// + +namespace zen { + +/** + * Slightly less naive CAS store + */ +class CasImpl : public CasStore +{ +public: + CasImpl(); + virtual ~CasImpl(); + + virtual void Initialize(const CasStoreConfiguration& InConfig) override; + virtual CasStore::InsertResult InsertChunk(const void* ChunkData, size_t ChunkSize, const IoHash& ChunkHash) override; + virtual CasStore::InsertResult InsertChunk(IoBuffer Chunk, const IoHash& ChunkHash) override; + virtual IoBuffer FindChunk(const IoHash& ChunkHash) override; + +private: + void PickDefaultDirectory(); + + CasContainerStrategy m_TinyStrategy; + CasContainerStrategy m_SmallStrategy; + FileCasStrategy m_LargeStrategy; +}; + +CasImpl::CasImpl() : m_TinyStrategy(m_Config, m_Stats), m_SmallStrategy(m_Config, m_Stats), m_LargeStrategy(m_Config, m_Stats) +{ +} + +CasImpl::~CasImpl() +{ +} + +void +CasImpl::Initialize(const CasStoreConfiguration& InConfig) +{ + m_Config = InConfig; + + spdlog::info("initializing CAS pool at {}", m_Config.RootDirectory); + + // Ensure root directory exists - create if it doesn't exist already + + 
std::filesystem::create_directories(m_Config.RootDirectory); + + // Open or create manifest + + bool IsNewStore = false; + + { + std::filesystem::path ManifestPath = m_Config.RootDirectory; + ManifestPath /= ".ucas_root"; + + CAtlFile marker; + HRESULT hRes = marker.Create(ManifestPath.c_str(), GENERIC_READ, 0, OPEN_EXISTING); + + if (FAILED(hRes)) + { + IsNewStore = true; + + ExtendableStringBuilder<128> manifest; + manifest.Append("#CAS_ROOT\n"); // TODO: should write something meaningful here + manifest.Append("ID="); + zen::Oid id = zen::Oid::NewOid(); + id.ToString(manifest); + + hRes = marker.Create(ManifestPath.c_str(), GENERIC_WRITE, 0, CREATE_ALWAYS); + + if (SUCCEEDED(hRes)) + marker.Write(manifest.c_str(), (DWORD)manifest.Size()); + } + } + + // Initialize payload storage + + m_TinyStrategy.Initialize("tobs", 16, IsNewStore); + m_SmallStrategy.Initialize("sobs", 4096, IsNewStore); +} + +CasStore::InsertResult +CasImpl::InsertChunk(const void* ChunkData, size_t ChunkSize, const IoHash& ChunkHash) +{ + if (ChunkSize < m_Config.TinyValueThreshold) + { + return m_TinyStrategy.InsertChunk(ChunkData, ChunkSize, ChunkHash); + } + else if (ChunkSize >= m_Config.HugeValueThreshold) + { + return m_LargeStrategy.InsertChunk(ChunkData, ChunkSize, ChunkHash); + } + else + { + return m_SmallStrategy.InsertChunk(ChunkData, ChunkSize, ChunkHash); + } +} + +CasStore::InsertResult +CasImpl::InsertChunk(IoBuffer Chunk, const IoHash& ChunkHash) +{ + const uint64_t ChunkSize = Chunk.Size(); + + if (ChunkSize < m_Config.TinyValueThreshold) + { + return m_TinyStrategy.InsertChunk(Chunk, ChunkHash); + } + else if (Chunk.Size() >= m_Config.HugeValueThreshold) + { + return m_LargeStrategy.InsertChunk(Chunk, ChunkHash); + } + else + { + return m_SmallStrategy.InsertChunk(Chunk, ChunkHash); + } +} + +IoBuffer +CasImpl::FindChunk(const IoHash& ChunkHash) +{ + if (IoBuffer Found = m_SmallStrategy.FindChunk(ChunkHash)) + { + return Found; + } + + if (IoBuffer Found = 
m_TinyStrategy.FindChunk(ChunkHash)) + { + return Found; + } + + if (IoBuffer Found = m_LargeStrategy.FindChunk(ChunkHash)) + { + return Found; + } + + // Not found + return IoBuffer{}; +} + +////////////////////////////////////////////////////////////////////////// + +CasStore* +CreateCasStore() +{ + return new CasImpl(); + // return new FileCasImpl(); +} + +////////////////////////////////////////////////////////////////////////// +// +// Testing related code follows... +// + +void +CAS_forcelink() +{ +} + +TEST_CASE("CasStore") +{ + zen::CasStoreConfiguration config; + config.RootDirectory = "c:\\temp\\test"; + + std::unique_ptr<zen::CasStore> store{CreateCasStore()}; + store->Initialize(config); +} + +} // namespace zen diff --git a/zenstore/caslog.cpp b/zenstore/caslog.cpp new file mode 100644 index 000000000..0f918bfd8 --- /dev/null +++ b/zenstore/caslog.cpp @@ -0,0 +1,220 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zenstore/cas.h> + +#include "CompactCas.h" + +#include <zencore/except.h> +#include <zencore/memory.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/uid.h> + +#include <xxhash.h> + +#include <gsl/gsl-lite.hpp> + +#include <functional> + +struct IUnknown; // Workaround for "combaseapi.h(229): error C2187: syntax error: 'identifier' was unexpected here" when using /permissive- +#include <atlfile.h> +#include <filesystem> + +////////////////////////////////////////////////////////////////////////// + +namespace zen { + +uint32_t +CasLogFile::FileHeader::ComputeChecksum() +{ + return XXH32(&this->Magic, sizeof(FileHeader) - 4, 0xC0C0'BABA); +} + +CasLogFile::CasLogFile() +{ +} + +CasLogFile::~CasLogFile() +{ +} + +void +CasLogFile::Open(std::filesystem::path FileName, size_t RecordSize, bool IsCreate) +{ + m_RecordSize = RecordSize; + + const DWORD dwCreationDisposition = IsCreate ? 
CREATE_ALWAYS : OPEN_EXISTING; + + HRESULT hRes = m_File.Create(FileName.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, dwCreationDisposition); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to open log file" /* TODO: add path */); + } + + uint64_t AppendOffset = 0; + + if (IsCreate) + { + // Initialize log by writing header + FileHeader Header = {.RecordSize = gsl::narrow<uint32_t>(RecordSize), .LogId = zen::Oid::NewOid(), .ValidatedTail = 0}; + memcpy(Header.Magic, FileHeader::MagicSequence, sizeof Header.Magic); + Header.Finalize(); + + m_File.Write(&Header, sizeof Header); + + AppendOffset = sizeof(FileHeader); + + m_Header = Header; + } + else + { + // Validate header and log contents and prepare for appending/replay + FileHeader Header; + m_File.Read(&Header, sizeof Header); + + if ((0 != memcmp(Header.Magic, FileHeader::MagicSequence, sizeof Header.Magic)) || (Header.Checksum != Header.ComputeChecksum())) + { + // TODO: provide more context! 
+            throw std::exception("Mangled log header");
+        }
+
+        ULONGLONG Sz;
+        m_File.GetSize(Sz);
+        AppendOffset = Sz;
+
+        m_Header = Header;
+    }
+
+    m_AppendOffset = AppendOffset;
+}
+
+/** Close the log. At present this only flushes the underlying file. */
+void
+CasLogFile::Close()
+{
+    // TODO: update header and maybe add trailer
+
+    Flush();
+}
+
+/**
+ * Read every complete record stored after the file header and pass each one to Handler.
+ *
+ * Records are handed over in file order as a pointer to m_RecordSize bytes inside a
+ * temporary buffer; the pointer is only valid for the duration of the callback. Any
+ * trailing bytes that do not form a whole record are ignored.
+ *
+ * Throws (via zen::ThrowIfFailed) if the file size cannot be queried or the seek/read fails.
+ */
+void
+CasLogFile::Replay(std::function<void(const void*)>&& Handler)
+{
+    ULONGLONG LogFileSize = 0;
+    zen::ThrowIfFailed(m_File.GetSize(LogFileSize), "Failed to query log file size");
+
+    const uint64_t LogBaseOffset = sizeof(FileHeader);
+
+    // A file that is not larger than the header holds no records. Checking this first
+    // keeps the unsigned subtraction below from wrapping around to a huge entry count.
+    if (LogFileSize <= LogBaseOffset)
+    {
+        return;
+    }
+
+    // Ensure we end up on a clean boundary
+    const size_t LogEntryCount = (LogFileSize - LogBaseOffset) / m_RecordSize;
+
+    if (LogEntryCount == 0)
+    {
+        return;
+    }
+
+    const uint64_t LogDataSize = LogEntryCount * m_RecordSize;
+
+    std::vector<uint8_t> ReadBuffer;
+    ReadBuffer.resize(LogDataSize);
+
+    zen::ThrowIfFailed(m_File.Seek(LogBaseOffset, FILE_BEGIN), "Failed to seek in log file");
+
+    // gsl::narrow throws if the record data does not fit in a single DWORD-sized read
+    HRESULT hRes = m_File.Read(ReadBuffer.data(), gsl::narrow<DWORD>(LogDataSize));
+
+    zen::ThrowIfFailed(hRes, "Failed to read log file");
+
+    // Index with size_t so the comparison against LogEntryCount is not signed/unsigned
+    for (size_t i = 0; i < LogEntryCount; ++i)
+    {
+        Handler(ReadBuffer.data() + (i * m_RecordSize));
+    }
+}
+
+/**
+ * Write DataSize bytes to the log.
+ *
+ * The write is issued without an explicit offset, so it lands at the file's current
+ * position; m_AppendOffset is neither consulted nor advanced here.
+ * NOTE(review): this appears to rely on Replay() having positioned the file at the end
+ * of the last complete record -- confirm before calling Append() on a reopened log
+ * without replaying it first.
+ *
+ * Throws std::system_error if the write fails.
+ */
+void
+CasLogFile::Append(const void* DataPointer, uint64_t DataSize)
+{
+    HRESULT hRes = m_File.Write(DataPointer, gsl::narrow<DWORD>(DataSize));
+
+    if (FAILED(hRes))
+    {
+        throw std::system_error(GetLastError(), std::system_category(), "Failed to write to log file" /* TODO: add context */);
+    }
+}
+
+/** Flush the underlying file. */
+void
+CasLogFile::Flush()
+{
+    m_File.Flush();
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+void
+CasBlobFile::Open(std::filesystem::path FileName, bool isCreate)
+{
+    const DWORD dwCreationDisposition = isCreate ?
CREATE_ALWAYS : OPEN_EXISTING; + + HRESULT hRes = m_File.Create(FileName.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ, dwCreationDisposition); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to open bucket sobs file"); + } +} + +void +CasBlobFile::Read(void* Data, uint64_t Size, uint64_t Offset) +{ + OVERLAPPED Ovl{}; + + Ovl.Offset = DWORD(Offset & 0xffff'ffffu); + Ovl.OffsetHigh = DWORD(Offset >> 32); + + HRESULT hRes = m_File.Read(Data, gsl::narrow<DWORD>(Size), &Ovl); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to read from file" /* TODO: add context */); + } +} + +IoBuffer +CasBlobFile::ReadAll() +{ + IoBuffer Buffer(FileSize()); + + Read((void*)Buffer.Data(), Buffer.Size(), 0); + + return Buffer; +} + +void +CasBlobFile::Write(const void* Data, uint64_t Size, uint64_t Offset) +{ + OVERLAPPED Ovl{}; + + Ovl.Offset = DWORD(Offset & 0xffff'ffffu); + Ovl.OffsetHigh = DWORD(Offset >> 32); + + HRESULT hRes = m_File.Write(Data, gsl::narrow<DWORD>(Size), &Ovl); + + if (FAILED(hRes)) + { + throw std::system_error(GetLastError(), std::system_category(), "Failed to write to file" /* TODO: add context */); + } +} + +void +CasBlobFile::Flush() +{ + m_File.Flush(); +} + +uint64_t +CasBlobFile::FileSize() +{ + ULONGLONG Sz; + m_File.GetSize(Sz); + + return uint64_t(Sz); +} + +} // namespace zen diff --git a/zenstore/compactcas.cpp b/zenstore/compactcas.cpp new file mode 100644 index 000000000..416943b77 --- /dev/null +++ b/zenstore/compactcas.cpp @@ -0,0 +1,119 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+
+#include <zenstore/cas.h>
+
+#include "CompactCas.h"
+
+#include <zencore/except.h>
+#include <zencore/memory.h>
+#include <zencore/string.h>
+#include <zencore/thread.h>
+#include <zencore/uid.h>
+
+#include <gsl/gsl-lite.hpp>
+
+#include <functional>
+
+struct IUnknown;  // Workaround for "combaseapi.h(229): error C2187: syntax error: 'identifier' was unexpected here" when using /permissive-
+#include <atlfile.h>
+#include <filesystem>
+
+//////////////////////////////////////////////////////////////////////////
+
+namespace zen {
+
+/**
+ * Open (or create) the three container files "<base>.ucas", "<base>.uidx" and
+ * "<base>.ulog" under the configured root directory and rebuild the in-memory
+ * location map by replaying the log.
+ *
+ * ContainerBaseName  file name stem shared by the three container files
+ * Alignment          payload alignment in bytes; must be a power of two
+ * IsNewStore         forwarded as the "create" flag when opening each file
+ *
+ * Must only be called once per instance (asserted).
+ */
+void
+CasContainerStrategy::Initialize(const std::string_view ContainerBaseName, uint64_t Alignment, bool IsNewStore)
+{
+    ZEN_ASSERT(IsPow2(Alignment));
+    ZEN_ASSERT(!m_IsInitialized);
+
+    m_PayloadAlignment = Alignment;
+    std::string BaseName(ContainerBaseName);
+    std::filesystem::path SobsPath = m_Config.RootDirectory / (BaseName + ".ucas");
+    std::filesystem::path SidxPath = m_Config.RootDirectory / (BaseName + ".uidx");
+    std::filesystem::path SlogPath = m_Config.RootDirectory / (BaseName + ".ulog");
+
+    m_SmallObjectFile.Open(SobsPath, IsNewStore);
+    m_SmallObjectIndex.Open(SidxPath, IsNewStore);
+    m_CasLog.Open(SlogPath, IsNewStore);
+
+    // TODO: should validate integrity of container files here
+
+    uint64_t MaxFileOffset = 0;
+
+    {
+        // This is not technically necessary but may help future static analysis
+        zen::RwLock::ExclusiveLockScope _(m_LocationMapLock);
+
+        m_CasLog.Replay([&](const CasDiskIndexEntry& Record) {
+            m_LocationMap[Record.Key] = Record.Location;
+
+            MaxFileOffset = std::max<uint64_t>(MaxFileOffset, Record.Location.Offset + Record.Location.Size);
+        });
+    }
+
+    // Next payload goes after the furthest byte any replayed record refers to, rounded up to the alignment
+    m_CurrentInsertOffset = (MaxFileOffset + m_PayloadAlignment - 1) & ~(m_PayloadAlignment - 1);
+    m_CurrentIndexOffset  = m_SmallObjectIndex.FileSize();
+    m_IsInitialized       = true;
+}
+
+/**
+ * Store a chunk in the container file unless a chunk with the same hash is already known.
+ *
+ * Returns InsertResult{.New = false} when the hash is already present, otherwise writes
+ * the payload at the current (aligned) insert offset, records its location in the map
+ * and in the log, and returns InsertResult{.New = true}.
+ *
+ * Throws gsl::narrowing_error if ChunkSize does not fit the 32-bit size field of
+ * CasDiskLocation; the previous plain cast would have stored a truncated size.
+ */
+CasStore::InsertResult
+CasContainerStrategy::InsertChunk(const void* ChunkData, size_t ChunkSize, const IoHash& ChunkHash)
+{
+    // Fast path: most repeated inserts are answered under the shared lock only
+    {
+        RwLock::SharedLockScope _(m_LocationMapLock);
+
+        if (m_LocationMap.find(ChunkHash) != m_LocationMap.end())
+        {
+            return CasStore::InsertResult{.New = false};
+        }
+    }
+
+    // Validate the size before touching any state so an oversized chunk leaves the store unchanged
+    const uint32_t StoredSize = gsl::narrow<uint32_t>(ChunkSize);
+
+    // New entry
+
+    RwLock::ExclusiveLockScope _(m_InsertLock);
+
+    // Check again now that inserts are serialized: another thread may have stored the same
+    // hash between the check above and acquiring m_InsertLock. Without this the payload and
+    // a second log record would be written for a chunk that is already present.
+    {
+        RwLock::SharedLockScope MapLock(m_LocationMapLock);
+
+        if (m_LocationMap.find(ChunkHash) != m_LocationMap.end())
+        {
+            return CasStore::InsertResult{.New = false};
+        }
+    }
+
+    const uint64_t InsertOffset = m_CurrentInsertOffset;
+    m_SmallObjectFile.Write(ChunkData, ChunkSize, InsertOffset);
+
+    m_CurrentInsertOffset = (InsertOffset + ChunkSize + m_PayloadAlignment - 1) & ~(m_PayloadAlignment - 1);
+
+    RwLock::ExclusiveLockScope __(m_LocationMapLock);
+
+    CasDiskLocation Location{.Offset = InsertOffset, .Size = StoredSize};
+
+    m_LocationMap[ChunkHash] = Location;
+
+    CasDiskIndexEntry IndexEntry{.Key = ChunkHash, .Location = Location};
+
+    m_CasLog.Append(IndexEntry);
+
+    return CasStore::InsertResult{.New = true};
+}
+
+/** Convenience overload: forwards the buffer's data pointer and size to the raw-pointer overload. */
+CasStore::InsertResult
+CasContainerStrategy::InsertChunk(IoBuffer Chunk, const IoHash& ChunkHash)
+{
+    return InsertChunk(Chunk.Data(), Chunk.Size(), ChunkHash);
+}
+
+/**
+ * Look up a chunk by hash.
+ *
+ * Returns a buffer built from the container file handle and the recorded offset/size,
+ * or a default-constructed IoBuffer when the hash is not in the location map.
+ */
+IoBuffer
+CasContainerStrategy::FindChunk(const IoHash& ChunkHash)
+{
+    RwLock::SharedLockScope _(m_LocationMapLock);
+    auto                    KeyIt = m_LocationMap.find(ChunkHash);
+
+    if (KeyIt != m_LocationMap.end())
+    {
+        const CasDiskLocation& Location = KeyIt->second;
+        return zen::IoBufferBuilder::MakeFromFileHandle(m_SmallObjectFile.Handle(), Location.Offset, Location.Size);
+    }
+
+    // Not found
+
+    return IoBuffer();
+}
+
+}  // namespace zen
diff --git a/zenstore/compactcas.h b/zenstore/compactcas.h
new file mode 100644
index 000000000..4d318c2e2
--- /dev/null
+++ b/zenstore/compactcas.h
@@ -0,0 +1,66 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+ +#pragma once + +#include <zencore/zencore.h> + +#include <zencore/iobuffer.h> +#include <zencore/iohash.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/uid.h> +#include <zencore/windows.h> +#include <zenstore/cas.h> +#include <zenstore/caslog.h> + +#include <atlfile.h> +#include <functional> + +namespace zen { + +////////////////////////////////////////////////////////////////////////// + +#pragma pack(push) +#pragma pack(1) + +struct CasDiskLocation +{ + uint64_t Offset; + uint32_t Size; // TODO: Make this more like the IoStore index so we can store larger chunks (should be five bytes) +}; + +struct CasDiskIndexEntry +{ + IoHash Key; + CasDiskLocation Location; +}; + +#pragma pack(pop) + +static_assert(sizeof(CasDiskIndexEntry) == 32); + +struct CasContainerStrategy +{ + CasContainerStrategy(const CasStoreConfiguration& Config, CasStore::Stats& Stats) : m_Config(Config), m_Stats(Stats) {} + CasStore::InsertResult InsertChunk(const void* chunkData, size_t chunkSize, const IoHash& chunkHash); + CasStore::InsertResult InsertChunk(IoBuffer Chunk, const IoHash& chunkHash); + IoBuffer FindChunk(const IoHash& chunkHash); + void Initialize(const std::string_view ContainerBaseName, uint64_t Alignment, bool IsNewStore); + +private: + const CasStoreConfiguration& m_Config; + CasStore::Stats& m_Stats; + uint64_t m_PayloadAlignment = 1 << 4; + bool m_IsInitialized = false; + CasBlobFile m_SmallObjectFile; + CasBlobFile m_SmallObjectIndex; + TCasLogFile<CasDiskIndexEntry> m_CasLog; + + RwLock m_LocationMapLock; + std::unordered_map<IoHash, CasDiskLocation, IoHash::Hasher> m_LocationMap; + RwLock m_InsertLock; // used to serialize inserts + std::atomic<uint64_t> m_CurrentInsertOffset = 0; + std::atomic<uint64_t> m_CurrentIndexOffset = 0; +}; + +} // namespace zen diff --git a/zenstore/filecas.cpp b/zenstore/filecas.cpp new file mode 100644 index 000000000..84a06c3be --- /dev/null +++ b/zenstore/filecas.cpp @@ -0,0 +1,237 @@ +// Copyright Epic 
Games, Inc. All Rights Reserved.
+
+#include "FileCas.h"
+
+#include <zencore/except.h>
+#include <zencore/memory.h>
+#include <zencore/string.h>
+#include <zencore/thread.h>
+#include <zencore/uid.h>
+
+#include <gsl/gsl-lite.hpp>
+
+#include <functional>
+#include <unordered_map>
+
+struct IUnknown;  // Workaround for "combaseapi.h(229): error C2187: syntax error: 'identifier' was unexpected here" when using /permissive-
+#include <atlfile.h>
+#include <filesystem>
+
+// Used for getting My Documents for default CAS
+#include <ShlObj.h>
+#pragma comment(lib, "shell32.lib")
+
+//////////////////////////////////////////////////////////////////////////
+
+namespace zen {
+
+/**
+ * Append the sharded relative path for ChunkHash to ShardedPath.
+ *
+ * The hex form of the hash is split as "<3 digits>\<2 digits>\<remaining digits>".
+ * OutShard2len receives the length of ShardedPath up to and including the second
+ * directory level, i.e. the prefix that must exist as a directory before the file
+ * can be created.
+ *
+ * Returns ShardedPath to allow chaining.
+ */
+WideStringBuilderBase&
+FileCasStrategy::MakeShardedPath(WideStringBuilderBase& ShardedPath, const IoHash& ChunkHash, size_t& OutShard2len)
+{
+    ExtendableStringBuilder<96> HashString;
+    ChunkHash.ToHexString(HashString);
+
+    const char* const str    = HashString.c_str();
+    const char* const strEnd = str + HashString.Size();
+
+    // Shard into a path with two directory levels containing 12 bits and 8 bits
+    // respectively.
+    //
+    // This results in a maximum of 4096 * 256 directories
+    //
+    // The numbers have been chosen somewhat arbitrarily but are large to scale
+    // to very large chunk repositories. It may or may not make sense to make
+    // this a configurable policy, and it would probably be a good idea to
+    // measure performance for different policies and chunk counts
+
+    ShardedPath.AppendAsciiRange(str, str + 3);
+
+    ShardedPath.Append('\\');
+    ShardedPath.AppendAsciiRange(str + 3, str + 5);
+    OutShard2len = ShardedPath.Size();
+
+    // The file name is everything after the five digits consumed by the directory levels.
+    // The previous range (str + 6 .. str + 64) dropped digit 5, so hashes differing only in
+    // that digit shared a file, and it ran past the end of the hex string.
+    // NOTE(review): this changes the file names produced for existing stores.
+    ShardedPath.Append('\\');
+    ShardedPath.AppendAsciiRange(str + 5, strEnd);
+
+    return ShardedPath;
+}
+
+/** Convenience overload: forwards the buffer's data pointer and size to the raw-pointer overload. */
+CasStore::InsertResult
+FileCasStrategy::InsertChunk(IoBuffer Chunk, const IoHash& ChunkHash)
+{
+    return InsertChunk(Chunk.Data(), Chunk.Size(), ChunkHash);
+}
+
+/**
+ * Store a chunk as its own file under the sharded path derived from ChunkHash.
+ *
+ * Returns InsertResult{.New = false} if a file for the hash can already be opened,
+ * otherwise creates the file (creating the shard directories on demand), writes the
+ * payload and returns InsertResult{.New = true}.
+ *
+ * Throws WindowsException if the file cannot be created or a write fails.
+ */
+CasStore::InsertResult
+FileCasStrategy::InsertChunk(const void* const ChunkData, const size_t ChunkSize, const IoHash& ChunkHash)
+{
+    size_t                           Shard2len = 0;
+    ExtendableWideStringBuilder<128> ShardedPath;
+    ShardedPath.Append(m_Config.RootDirectory.c_str());
+    ShardedPath.Append(std::filesystem::path::preferred_separator);
+    MakeShardedPath(ShardedPath, ChunkHash, /* out */ Shard2len);
+
+    // See if file already exists
+    //
+    // Future improvement: maintain Bloom filter to avoid expensive file system probes?
+
+    CAtlFile PayloadFile;
+
+    HRESULT hRes = PayloadFile.Create(ShardedPath.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING);
+
+    if (SUCCEEDED(hRes))
+    {
+        // If we succeeded in opening the file then we don't need to do anything else because it already exists and should contain the
+        // content we were about to insert
+        return CasStore::InsertResult{.New = false};
+    }
+
+    PayloadFile.Close();
+
+    RwLock::ExclusiveLockScope _(LockForHash(ChunkHash));
+
+    // For now, use double-checked locking to see if someone else was first
+
+    hRes = PayloadFile.Create(ShardedPath.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING);
+
+    if (SUCCEEDED(hRes))
+    {
+        // If we succeeded in opening the file then we don't need to do anything
+        // else because someone else managed to create the file before we did. Just return.
+        return {.New = false};
+    }
+
+    auto InternalCreateFile = [&] { return PayloadFile.Create(ShardedPath.c_str(), GENERIC_WRITE, FILE_SHARE_DELETE, CREATE_ALWAYS); };
+
+    hRes = InternalCreateFile();
+
+    if (hRes == HRESULT_FROM_WIN32(ERROR_PATH_NOT_FOUND))
+    {
+        // Ensure parent directories exist
+
+        std::filesystem::create_directories(std::wstring_view(ShardedPath.c_str(), Shard2len));
+
+        hRes = InternalCreateFile();
+    }
+
+    if (FAILED(hRes))
+    {
+        throw WindowsException(hRes, "Failed to open shard file");
+    }
+
+    size_t ChunkRemain = ChunkSize;
+    auto   ChunkCursor = reinterpret_cast<const uint8_t*>(ChunkData);
+
+    // Write in pieces of at most 1 MiB so each call's byte count fits the 32-bit write API
+    while (ChunkRemain != 0)
+    {
+        uint32_t ByteCount = uint32_t(std::min<size_t>(1024 * 1024ull, ChunkRemain));
+
+        // A failed write must not be reported (or counted in the stats) as a stored chunk
+        const HRESULT hWrite = PayloadFile.Write(ChunkCursor, ByteCount);
+
+        if (FAILED(hWrite))
+        {
+            throw WindowsException(hWrite, "Failed to write shard file");
+        }
+
+        ChunkCursor += ByteCount;
+        ChunkRemain -= ByteCount;
+    }
+
+    AtomicIncrement(m_Stats.PutCount);
+    AtomicAdd(m_Stats.PutBytes, ChunkSize);
+
+    return {.New = true};
+}
+
+/**
+ * Look up a chunk by hash by opening the file at its sharded path.
+ *
+ * Returns whatever IoBufferBuilder::MakeFromFile yields for that path; the get
+ * statistics are only updated when the returned buffer tests true.
+ */
+IoBuffer
+FileCasStrategy::FindChunk(const IoHash& ChunkHash)
+{
+    size_t                           Shard2len = 0;
+    ExtendableWideStringBuilder<128> ShardedPath;
+    ShardedPath.Append(m_Config.RootDirectory.c_str());
+    ShardedPath.Append(std::filesystem::path::preferred_separator);
+    MakeShardedPath(ShardedPath, ChunkHash, /* out */ Shard2len);
+
+    RwLock::SharedLockScope _(LockForHash(ChunkHash));
+
+    auto Chunk = IoBufferBuilder::MakeFromFile(ShardedPath.c_str());
+
+    if (Chunk)
+    {
+        AtomicIncrement(m_Stats.GetCount);
+        AtomicAdd(m_Stats.GetBytes, Chunk.Size());
+    }
+
+    return Chunk;
+}
+
+/**
+ * Straightforward file-per-chunk CAS store implementation
+ */
+class FileCasImpl : public CasStore
+{
+public:
+    FileCasImpl() : m_Strategy(m_Config, m_Stats) {}
+    virtual ~FileCasImpl() = default;
+
+    // If no root directory is configured, default to "<My Documents>\zen\DefaultCAS".
+    // The root directory is left empty if the documents folder cannot be resolved.
+    void PickDefaultDirectory()
+    {
+        if (m_Config.RootDirectory.empty())
+        {
+            // Pick sensible default
+
+            WCHAR   myDocumentsDir[MAX_PATH];
+            HRESULT hRes = SHGetFolderPathW(NULL,
+                                            CSIDL_PERSONAL /* My Documents */,
+                                            NULL,
+                                            SHGFP_TYPE_CURRENT,
+                                            /* out */ myDocumentsDir);
+
+            if (SUCCEEDED(hRes))
+            {
+                wcscat_s(myDocumentsDir, L"\\zen\\DefaultCAS");
+
+                m_Config.RootDirectory = myDocumentsDir;
+            }
+        }
+    }
+
+    // Copy the configuration, make sure the root directory exists and drop a
+    // ".cas_root" marker file into it if one cannot be opened.
+    virtual void Initialize(const CasStoreConfiguration& InConfig) override
+    {
+        m_Config = InConfig;
+
+        if (m_Config.RootDirectory.empty())
+        {
+            PickDefaultDirectory();
+        }
+
+        // Ensure root directory exists - create if it doesn't exist already
+
+        std::filesystem::create_directories(m_Config.RootDirectory);
+
+        std::filesystem::path filepath = m_Config.RootDirectory;
+        filepath /= ".cas_root";
+
+        CAtlFile marker;
+        HRESULT  hRes = marker.Create(filepath.c_str(), GENERIC_READ, 0, OPEN_EXISTING);
+
+        if (FAILED(hRes))
+        {
+            ExtendableStringBuilder<128> manifest;
+            manifest.Append("CAS_ROOT");
+            hRes = marker.Create(filepath.c_str(), GENERIC_WRITE, 0, CREATE_ALWAYS);
+
+            if (SUCCEEDED(hRes))
+                marker.Write(manifest.c_str(), (DWORD)manifest.Size());
+        }
+    }
+
+    virtual CasStore::InsertResult InsertChunk(const void* chunkData, size_t chunkSize, const IoHash& chunkHash) override
+    {
+        return m_Strategy.InsertChunk(chunkData, chunkSize, chunkHash);
+    }
+    virtual CasStore::InsertResult InsertChunk(IoBuffer Chunk, const IoHash& chunkHash) override
+    {
+        return m_Strategy.InsertChunk(Chunk, chunkHash);
+    }
+    virtual IoBuffer FindChunk(const IoHash& chunkHash) override { return m_Strategy.FindChunk(chunkHash); }
+
+private:
+    FileCasStrategy m_Strategy;
+};
+
+}  // namespace zen
diff --git a/zenstore/filecas.h b/zenstore/filecas.h
new file mode 100644
index 000000000..21ad8ba7c
--- /dev/null
+++ b/zenstore/filecas.h
@@ -0,0 +1,32 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+ +#pragma once + +#include <zencore/zencore.h> + +#include <zencore/iobuffer.h> +#include <zencore/iohash.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zenstore/cas.h> + +namespace zen { + +struct FileCasStrategy +{ + FileCasStrategy(const CasStoreConfiguration& Config, CasStore::Stats& Stats) : m_Config(Config), m_Stats(Stats) {} + CasStore::InsertResult InsertChunk(const void* chunkData, size_t chunkSize, const IoHash& chunkHash); + CasStore::InsertResult InsertChunk(IoBuffer Chunk, const IoHash& chunkHash); + IoBuffer FindChunk(const IoHash& chunkHash); + +private: + const CasStoreConfiguration& m_Config; + CasStore::Stats& m_Stats; + RwLock m_Lock; + RwLock m_ShardLocks[256]; // TODO: these should be spaced out so they don't share cache lines + + inline RwLock& LockForHash(const IoHash& Hash) { return m_ShardLocks[Hash.Hash[19]]; } + static WideStringBuilderBase& MakeShardedPath(WideStringBuilderBase& ShardedPath, const IoHash& ChunkHash, size_t& OutShard2len); +}; + +} // namespace zen diff --git a/zenstore/gc.cpp b/zenstore/gc.cpp new file mode 100644 index 000000000..bfb8f015e --- /dev/null +++ b/zenstore/gc.cpp @@ -0,0 +1,26 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <zenstore/gc.h> + +namespace zen { + +CasGc::CasGc(CasStore& Store) : m_CasStore(Store) +{ +} + +CasGc::~CasGc() +{ +} + +void +CasGc::CollectGarbage() +{ +} + +void +CasGc::OnNewReferences(std::span<IoHash> Hashes) +{ + ZEN_UNUSED(Hashes); +} + +} // namespace zen diff --git a/zenstore/include/zenstore/CAS.h b/zenstore/include/zenstore/CAS.h new file mode 100644 index 000000000..8b9a66e3f --- /dev/null +++ b/zenstore/include/zenstore/CAS.h @@ -0,0 +1,66 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+
+#pragma once
+
+#include <zencore/zencore.h>
+
+#include <zencore/blake3.h>
+#include <zencore/iobuffer.h>
+#include <zencore/iohash.h>
+#include <zencore/refcount.h>
+#include <atomic>
+#include <filesystem>
+#include <memory>
+#include <string>
+
+namespace zen {
+
+/** Settings handed to CasStore::Initialize() */
+struct CasStoreConfiguration
+{
+    // Root directory for CAS store -- if not specified a default folder will be assigned in 'Documents\zen'
+    std::filesystem::path RootDirectory;
+
+    // Threshold below which values are considered 'tiny' and managed using the 'tiny values' strategy
+    uint64_t TinyValueThreshold = 1024;
+
+    // Threshold at or above which values are considered 'huge' and managed using the 'huge values' strategy
+    uint64_t HugeValueThreshold = 1024 * 1024;
+};
+
+/**
+ * Abstract content-addressable store: chunks are inserted and looked up by their IoHash.
+ *
+ * Concrete stores are obtained through CreateCasStore() and must be given a
+ * configuration via Initialize() before use.
+ */
+class CasStore
+{
+public:
+    virtual ~CasStore() = default;
+
+    // Running byte/operation counters for inserts (Put*) and lookups (Get*)
+    struct Stats
+    {
+        uint64_t PutBytes = 0;
+        uint64_t PutCount = 0;
+
+        uint64_t GetBytes = 0;
+        uint64_t GetCount = 0;
+    };
+
+    // Both accessors are const so they can be used through a const CasStore reference
+    const CasStoreConfiguration& Config() const { return m_Config; }
+    const Stats&                 GetStats() const { return m_Stats; }
+
+    struct InsertResult
+    {
+        // True when the chunk was not present before and has been stored by this call
+        bool New = false;
+    };
+
+    virtual void         Initialize(const CasStoreConfiguration& Config)                                       = 0;
+    virtual InsertResult InsertChunk(const void* ChunkData, size_t ChunkSize, const IoHash& ChunkHash)        = 0;
+    virtual InsertResult InsertChunk(IoBuffer Data, const IoHash& ChunkHash)                                   = 0;
+    virtual IoBuffer     FindChunk(const IoHash& ChunkHash)                                                    = 0;
+
+protected:
+    CasStoreConfiguration m_Config;
+    Stats                 m_Stats;
+};
+
+ZENCORE_API CasStore* CreateCasStore();
+
+void CAS_forcelink();
+
+}  // namespace zen
diff --git a/zenstore/include/zenstore/caslog.h b/zenstore/include/zenstore/caslog.h
new file mode 100644
index 000000000..b318577d7
--- /dev/null
+++ b/zenstore/include/zenstore/caslog.h
@@ -0,0 +1,96 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+ +#pragma once + +#include <zencore/zencore.h> + +#include <zencore/iobuffer.h> +#include <zencore/string.h> +#include <zencore/thread.h> +#include <zencore/uid.h> +#include <zencore/windows.h> +#include <zenstore/cas.h> + +#include <atlfile.h> +#include <functional> + +namespace zen { + +class CasLogFile +{ +public: + CasLogFile(); + ~CasLogFile(); + + void Open(std::filesystem::path FileName, size_t RecordSize, bool isCreate); + void Append(const void* DataPointer, uint64_t DataSize); + void Replay(std::function<void(const void*)>&& Handler); + void Flush(); + void Close(); + +private: + struct FileHeader + { + uint8_t Magic[16]; + uint32_t RecordSize = 0; + zen::Oid LogId; + uint32_t ValidatedTail = 0; + uint32_t Pad[6]; + uint32_t Checksum = 0; + + static const inline uint8_t MagicSequence[16] = {'.', '-', '=', ' ', 'C', 'A', 'S', 'L', 'O', 'G', 'v', '1', ' ', '=', '-', '.'}; + + ZENCORE_API uint32_t ComputeChecksum(); + void Finalize() { Checksum = ComputeChecksum(); } + }; + + static_assert(sizeof(FileHeader) == 64); + +private: + CAtlFile m_File; + FileHeader m_Header; + size_t m_RecordSize = 1; + uint64_t m_AppendOffset = 0; +}; + +template<typename T> +class TCasLogFile : public CasLogFile +{ +public: + // This should be called before the Replay() is called to do some basic sanity checking + bool Initialize() { return true; } + + void Replay(std::invocable<const T&> auto Handler) + { + CasLogFile::Replay([&](const void* VoidPtr) { + const T& Record = *reinterpret_cast<const T*>(VoidPtr); + + Handler(Record); + }); + } + + void Append(const T& Record) { CasLogFile::Append(&Record, sizeof Record); } + void Open(std::filesystem::path FileName, bool IsCreate) { CasLogFile::Open(FileName, sizeof(T), IsCreate); } +}; + +////////////////////////////////////////////////////////////////////////// +// +// This should go in its own header +// + +class CasBlobFile +{ +public: + void Open(std::filesystem::path FileName, bool IsCreate); + void Read(void* Data, uint64_t 
Size, uint64_t Offset); + void Write(const void* Data, uint64_t Size, uint64_t Offset); + void Flush(); + uint64_t FileSize(); + void* Handle() { return m_File; } + IoBuffer ReadAll(); + +private: + CAtlFile m_File; +}; + +} // namespace zen diff --git a/zenstore/include/zenstore/gc.h b/zenstore/include/zenstore/gc.h new file mode 100644 index 000000000..055843547 --- /dev/null +++ b/zenstore/include/zenstore/gc.h @@ -0,0 +1,28 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/iohash.h> + +#include <span> + +namespace zen { + +class CasStore; +struct IoHash; + +class CasGc +{ +public: + CasGc(CasStore& Store); + ~CasGc(); + + void CollectGarbage(); + + void OnNewReferences(std::span<IoHash> Hashes); + +private: + CasStore& m_CasStore; +}; + +} // namespace zen diff --git a/zenstore/include/zenstore/scrub.h b/zenstore/include/zenstore/scrub.h new file mode 100644 index 000000000..5a34d4860 --- /dev/null +++ b/zenstore/include/zenstore/scrub.h @@ -0,0 +1,24 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#pragma once + +#include <zencore/iohash.h> + +#include <span> + +namespace zen { + +class CasStore; +struct IoHash; + +class CasScrubber +{ +public: + CasScrubber(CasStore& Store); + ~CasScrubber(); + +private: + CasStore& m_CasStore; +}; + +} // namespace zen diff --git a/zenstore/scrub.cpp b/zenstore/scrub.cpp new file mode 100644 index 000000000..4df337349 --- /dev/null +++ b/zenstore/scrub.cpp @@ -0,0 +1,15 @@ +// Copyright Epic Games, Inc. All Rights Reserved. 
+ +#include <zenstore/scrub.h> + +namespace zen { + +CasScrubber::CasScrubber(CasStore& Store) : m_CasStore(Store) +{ +} + +CasScrubber::~CasScrubber() +{ +} + +} // namespace zen diff --git a/zenstore/zenstore.vcxproj b/zenstore/zenstore.vcxproj new file mode 100644 index 000000000..4a39e826d --- /dev/null +++ b/zenstore/zenstore.vcxproj @@ -0,0 +1,121 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <ItemGroup> + <ClCompile Include="CAS.cpp" /> + <ClCompile Include="caslog.cpp" /> + <ClCompile Include="compactcas.cpp" /> + <ClCompile Include="filecas.cpp" /> + <ClCompile Include="gc.cpp" /> + <ClCompile Include="scrub.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="compactcas.h" /> + <ClInclude Include="filecas.h" /> + <ClInclude Include="include\zenstore\gc.h" /> + <ClInclude Include="include\zenstore\scrub.h" /> + <ClInclude Include="include\zenstore\CAS.h" /> + <ClInclude Include="include\zenstore\caslog.h" /> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\zencore\zencore.vcxproj"> + <Project>{d75bf9ab-c61e-4fff-ad59-1563430f05e2}</Project> + </ProjectReference> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>16.0</VCProjectVersion> + <Keyword>Win32Proj</Keyword> + <ProjectGuid>{26cbbaeb-14c1-4efc-877d-80f48215651c}</ProjectGuid> + <RootNamespace>zenstore</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" 
Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_debug.props" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zenfs_common.props" /> + <Import Project="..\zen_base_release.props" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <LinkIncremental>true</LinkIncremental> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <LinkIncremental>false</LinkIncremental> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + 
<VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>../zencore/include;./include</AdditionalIncludeDirectories> + </ClCompile> + <Link> + <SubSystem> + </SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile> + <AdditionalIncludeDirectories>../zencore/include;./include</AdditionalIncludeDirectories> + </ClCompile> + <Link> + <SubSystem> + </SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file
diff --git a/zenstore/zenstore.vcxproj.filters b/zenstore/zenstore.vcxproj.filters
new file mode 100644
index 000000000..6ab5a7cb2
--- /dev/null
+++ b/zenstore/zenstore.vcxproj.filters
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Item lists mirror zenstore.vcxproj; add new sources/headers to both files. -->
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="CAS.cpp" />
+    <ClCompile Include="caslog.cpp" />
+    <ClCompile Include="compactcas.cpp" />
+    <ClCompile Include="filecas.cpp" />
+    <ClCompile Include="gc.cpp" />
+    <ClCompile Include="scrub.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="compactcas.h" />
+    <ClInclude Include="filecas.h" />
+    <ClInclude Include="include\zenstore\gc.h" />
+    <ClInclude Include="include\zenstore\scrub.h" />
+    <ClInclude Include="include\zenstore\CAS.h" />
+    <ClInclude Include="include\zenstore\caslog.h" />
+  </ItemGroup>
+</Project>
\ No newline at end of file diff --git a/zentest-appstub/zentest-appstub.cpp b/zentest-appstub/zentest-appstub.cpp new file mode 100644 index 000000000..bea50270d --- /dev/null +++ b/zentest-appstub/zentest-appstub.cpp @@ -0,0 +1,24 @@ +// Copyright Epic Games, Inc. All Rights Reserved. + +#include <stdio.h> +#include <cstdlib> +#include <cstring> +#include <thread> + +using namespace std::chrono_literals; + +int +main(int argc, char* argv[]) +{ + for (int i = 0; i < argc; ++i) + { + if (std::strncmp(argv[i], "-t=", 3) == 0) + { + int sleeptime = std::atoi(argv[i] + 3); + + printf("[zentest] sleeping for %ds!", sleeptime); + + std::this_thread::sleep_for(sleeptime * 1s); + } + } +} diff --git a/zentest-appstub/zentest-appstub.vcxproj b/zentest-appstub/zentest-appstub.vcxproj new file mode 100644 index 000000000..cf8fd3c5e --- /dev/null +++ b/zentest-appstub/zentest-appstub.vcxproj @@ -0,0 +1,152 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|Win32"> + <Configuration>Debug</Configuration> + <Platform>Win32</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|Win32"> + <Configuration>Release</Configuration> + <Platform>Win32</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>16.0</VCProjectVersion> + <Keyword>Win32Proj</Keyword> + <ProjectGuid>{7ffc7e77-d038-44e9-8d84-41918c355f29}</ProjectGuid> + <RootNamespace>zentestappstub</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import 
Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" 
Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <LinkIncremental>true</LinkIncremental> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <LinkIncremental>false</LinkIncremental> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <LinkIncremental>true</LinkIncremental> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <LinkIncremental>false</LinkIncremental> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" /> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + 
<IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <LanguageStandard>stdcpplatest</LanguageStandard> + <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <LanguageStandard>stdcpplatest</LanguageStandard> + <RuntimeLibrary>MultiThreaded</RuntimeLibrary> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="zentest-appstub.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/zentest-appstub/zentest-appstub.vcxproj.filters b/zentest-appstub/zentest-appstub.vcxproj.filters new file mode 100644 index 000000000..23c71deb6 --- /dev/null +++ b/zentest-appstub/zentest-appstub.vcxproj.filters @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClCompile Include="zentest-appstub.cpp" /> + </ItemGroup> +</Project>
\ No newline at end of file
diff --git a/zentestutil/include/zenserverprocess.h b/zentestutil/include/zenserverprocess.h
new file mode 100644
index 000000000..3f86f283c
--- /dev/null
+++ b/zentestutil/include/zenserverprocess.h
@@ -0,0 +1,53 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zencore/thread.h>
+
+#include <spdlog/spdlog.h>
+
+#include <filesystem>
+
+// Shared context for tests that spawn server processes: remembers where the program
+// binaries live and the root directory under which per-test directories are created.
+class ZenTestEnvironment
+{
+public:
+    ZenTestEnvironment();
+    ~ZenTestEnvironment();
+
+    // Stores both directories, passes TestBaseDir to zen::DeleteDirectories (logged as
+    // "Cleaning") and marks the environment as initialized.
+    void Initialize(std::filesystem::path ProgramBaseDir, std::filesystem::path TestBaseDir);
+
+    // Creates <TestBaseDir>/test<N>, where N comes from a process-wide incrementing
+    // counter, and returns the new path.
+    std::filesystem::path CreateNewTestDir();
+    std::filesystem::path ProgramBaseDir() const { return m_ProgramBaseDir; }
+    bool IsInitialized() const { return m_IsInitialized; }
+
+private:
+    std::filesystem::path m_ProgramBaseDir;
+    std::filesystem::path m_TestBaseDir;
+    bool m_IsInitialized = false; // Set to true at the end of Initialize()
+};
+
+// Handle for one spawned zenserver child process.
+//
+// On destruction, if a process was spawned: when EnableTermination() was called the
+// process is ended via Terminate(111); otherwise the shutdown event is signalled and
+// the destructor waits for the process.
+struct ZenServerInstance
+{
+    // Asserts that TestEnvironment has been initialized; keeps a reference to it.
+    ZenServerInstance(ZenTestEnvironment& TestEnvironment);
+    ~ZenServerInstance();
+
+    // Sets the shutdown event handed to this instance by SpawnServer().
+    void SignalShutdown() { m_ShutdownEvent.Set(); }
+    // Blocks on the ready event created by SpawnServer().
+    // NOTE(review): presumably the child sets this event once it is serving - confirm.
+    void WaitUntilReady() { m_ReadyEvent.Wait(); }
+    // Makes the destructor terminate the process instead of requesting a shutdown.
+    void EnableTermination() { m_Terminate = true; }
+
+    // Directory passed to the server as --data-dir. Must be called before SpawnServer()
+    // (asserts that no process has been started yet).
+    void SetTestDir(std::filesystem::path TestDir)
+    {
+        ZEN_ASSERT(!m_Process.IsValid());
+        m_TestDir = TestDir;
+    }
+
+    // Launches zenserver.exe from the environment's program directory with
+    // "--port BasePort". May only be called once per instance; throws
+    // std::system_error if the process cannot be created.
+    void SpawnServer(int BasePort);
+
+private:
+    ZenTestEnvironment& m_Env;
+    zen::Process m_Process;
+    zen::Event m_ReadyEvent;
+    zen::Event m_ShutdownEvent;
+    bool m_Terminate = false;
+    std::filesystem::path m_TestDir; // Empty means no --data-dir argument is passed
+};
diff --git a/zentestutil/zenserverprocess.cpp b/zentestutil/zenserverprocess.cpp
new file mode 100644
index 000000000..9d62d692f
--- /dev/null
+++ b/zentestutil/zenserverprocess.cpp
@@ -0,0 +1,155 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+ +#include "zenserverprocess.h" + +#include <zencore/filesystem.h> +#include <zencore/fmtutils.h> +#include <zencore/string.h> + +#include <spdlog/spdlog.h> + +////////////////////////////////////////////////////////////////////////// + +std::atomic<int> TestCounter{0}; + +ZenTestEnvironment::ZenTestEnvironment() +{ +} + +ZenTestEnvironment::~ZenTestEnvironment() +{ +} + +void +ZenTestEnvironment::Initialize(std::filesystem::path ProgramBaseDir, std::filesystem::path TestBaseDir) +{ + m_ProgramBaseDir = ProgramBaseDir; + m_TestBaseDir = TestBaseDir; + + spdlog::info("Cleaning '{}'", TestBaseDir); + zen::DeleteDirectories(TestBaseDir.c_str()); + + m_IsInitialized = true; +} + +std::filesystem::path +ZenTestEnvironment::CreateNewTestDir() +{ + using namespace std::literals; + + zen::ExtendableWideStringBuilder<256> TestDir; + TestDir << "test"sv << int64_t(++TestCounter); + + std::filesystem::path TestPath = m_TestBaseDir / TestDir.c_str(); + + zen::CreateDirectories(TestPath.c_str()); + + return TestPath; +} + +////////////////////////////////////////////////////////////////////////// + +std::atomic<int> ChildIdCounter{0}; + +ZenServerInstance::ZenServerInstance(ZenTestEnvironment& TestEnvironment) : m_Env(TestEnvironment) +{ + ZEN_ASSERT(TestEnvironment.IsInitialized()); +} + +ZenServerInstance::~ZenServerInstance() +{ + if (m_Process.IsValid()) + { + if (m_Terminate) + { + spdlog::info("Terminating zenserver process"); + m_Process.Terminate(111); + } + else + { + SignalShutdown(); + m_Process.Wait(); + } + } +} + +void +ZenServerInstance::SpawnServer(int BasePort) +{ + ZEN_ASSERT(!m_Process.IsValid()); // Only spawn once + + const std::filesystem::path BaseDir = m_Env.ProgramBaseDir(); + const std::filesystem::path Executable = BaseDir / "zenserver.exe"; + + const int MyPid = _getpid(); + const int ChildId = ++ChildIdCounter; + + zen::ExtendableStringBuilder<32> ChildEventName; + ChildEventName << "Zen_Child_" << ChildId; + zen::NamedEvent 
ChildEvent{ChildEventName}; + + zen::ExtendableStringBuilder<32> ChildShutdownEventName; + ChildShutdownEventName << "Zen_Child_" << ChildId; + ChildShutdownEventName << "_Shutdown"; + zen::NamedEvent ChildShutdownEvent{ChildShutdownEventName}; + + zen::ExtendableStringBuilder<32> LogId; + LogId << "Zen" << ChildId; + + zen::ExtendableWideStringBuilder<128> CommandLine; + CommandLine << "\""; + CommandLine.Append(Executable.c_str()); + CommandLine << "\" --test --owner-pid "; + CommandLine << MyPid; + CommandLine << " "; + CommandLine << "--port " << BasePort; + CommandLine << " --child-id " << ChildEventName; + CommandLine << " --log-id " << LogId; + + if (!m_TestDir.empty()) + { + CommandLine << " --data-dir "; + CommandLine << m_TestDir.c_str(); + } + + std::filesystem::path CurrentDirectory = std::filesystem::current_path(); + + spdlog::debug("Spawning server"); + + PROCESS_INFORMATION ProcessInfo{}; + STARTUPINFO Sinfo{.cb = sizeof(STARTUPINFO)}; + + DWORD CreationFlags = 0; // CREATE_NEW_CONSOLE; + const bool InheritHandles = false; + void* Environment = nullptr; + LPSECURITY_ATTRIBUTES ProcessAttributes = nullptr; + LPSECURITY_ATTRIBUTES ThreadAttributes = nullptr; + + BOOL Success = CreateProcessW(Executable.c_str(), + (LPWSTR)CommandLine.c_str(), + ProcessAttributes, + ThreadAttributes, + InheritHandles, + CreationFlags, + Environment, + CurrentDirectory.c_str(), + &Sinfo, + &ProcessInfo); + + if (Success == FALSE) + { + std::error_code err(::GetLastError(), std::system_category()); + + spdlog::error("Server spawn failed: {}", err.message()); + + throw std::system_error(err, "failed to create server process"); + } + + spdlog::debug("Server spawned OK"); + + CloseHandle(ProcessInfo.hThread); + + m_Process.Initialize(ProcessInfo.hProcess); + m_ReadyEvent = std::move(ChildEvent); + m_ShutdownEvent = std::move(ChildShutdownEvent); +} diff --git a/zentestutil/zentestutil.vcxproj b/zentestutil/zentestutil.vcxproj new file mode 100644 index 000000000..8213763fc 
--- /dev/null +++ b/zentestutil/zentestutil.vcxproj @@ -0,0 +1,104 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <VCProjectVersion>16.0</VCProjectVersion> + <Keyword>Win32Proj</Keyword> + <ProjectGuid>{77f8315d-b21d-4db0-9a6f-2d3359f88a70}</ProjectGuid> + <RootNamespace>zentestutil</RootNamespace> + <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>StaticLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v142</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="Shared"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" 
Label="LocalAppDataPlatform" /> + <Import Project="..\zen_base_debug.props" /> + <Import Project="..\zenfs_common.props" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\zen_base_release.props" /> + <Import Project="..\zenfs_common.props" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <LinkIncremental>true</LinkIncremental> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <LinkIncremental>false</LinkIncremental> + </PropertyGroup> + <PropertyGroup Label="Vcpkg"> + <VcpkgEnableManifest>true</VcpkgEnableManifest> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <VcpkgUseStatic>true</VcpkgUseStatic> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>..\zencore\include;..\zenstore\include;.\include</AdditionalIncludeDirectories> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <FunctionLevelLinking>true</FunctionLevelLinking> + 
<IntrinsicFunctions>true</IntrinsicFunctions> + <SDLCheck>true</SDLCheck> + <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <ConformanceMode>true</ConformanceMode> + <AdditionalIncludeDirectories>..\zencore\include;..\zenstore\include;.\include</AdditionalIncludeDirectories> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <GenerateDebugInformation>true</GenerateDebugInformation> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="zenserverprocess.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="zenserverprocess.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/zentestutil/zentestutil.vcxproj.filters b/zentestutil/zentestutil.vcxproj.filters new file mode 100644 index 000000000..1afefcdf7 --- /dev/null +++ b/zentestutil/zentestutil.vcxproj.filters @@ -0,0 +1,9 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <ClCompile Include="zenserverprocess.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="zenserverprocess.h" /> + </ItemGroup> +</Project>
\ No newline at end of file |