xxhash.h (278450B)
1 /* 2 * xxHash - Extremely Fast Hash algorithm 3 * Header File 4 * Copyright (C) 2012-2023 Yann Collet 5 * 6 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are 10 * met: 11 * 12 * * Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * * Redistributions in binary form must reproduce the above 15 * copyright notice, this list of conditions and the following disclaimer 16 * in the documentation and/or other materials provided with the 17 * distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 * You can contact the author at: 32 * - xxHash homepage: https://www.xxhash.com 33 * - xxHash source repository: https://github.com/Cyan4973/xxHash 34 */ 35 36 /*! 37 * @mainpage xxHash 38 * 39 * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed 40 * limits. 41 * 42 * It is proposed in four flavors, in three families: 43 * 1. @ref XXH32_family 44 * - Classic 32-bit hash function. Simple, compact, and runs on almost all 45 * 32-bit and 64-bit systems. 46 * 2. @ref XXH64_family 47 * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most 48 * 64-bit systems (but _not_ 32-bit systems). 49 * 3. @ref XXH3_family 50 * - Modern 64-bit and 128-bit hash function family which features improved 51 * strength and performance across the board, especially on smaller data. 52 * It benefits greatly from SIMD and 64-bit without requiring it. 53 * 54 * Benchmarks 55 * --- 56 * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. 57 * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 58 * 59 * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | 60 * | -------------------- | ------- | ----: | ---------------: | ------------------: | 61 * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | 62 * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | 63 * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | 64 * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | 65 * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | 66 * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | 67 * | RAM sequential read | | N/A | 28.0 GB/s | N/A | 68 * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | 69 * | City64 | | 64 | 22.0 GB/s | 76.6 | 70 * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | 71 * | City128 | | 128 | 21.7 GB/s | 57.7 | 72 * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | 73 * | XXH64() | | 64 | 19.4 GB/s | 71.0 | 74 * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | 75 * | Mum | | 64 | 18.0 GB/s | 67.0 | 76 * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | 77 * | XXH32() | | 32 | 9.7 GB/s | 71.9 | 78 * | City32 | | 32 | 9.1 GB/s | 66.0 | 79 * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | 80 * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | 81 * | SipHash* | | 64 | 3.0 GB/s | 43.2 | 82 * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | 83 * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | 84 * | FNV64 | | 64 | 1.2 GB/s | 62.7 | 85 * | Blake2* | | 256 | 1.1 GB/s | 5.1 | 86 * | SHA1* | | 160 | 0.8 GB/s | 5.6 | 87 * | MD5* | | 128 | 0.6 GB/s | 7.8 | 88 * @note 89 * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, 90 * even though it is mandatory on x64. 91 * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic 92 * by modern standards. 93 * - Small data velocity is a rough average of algorithm's efficiency for small 94 * data. For more accurate information, see the wiki. 95 * - More benchmarks and strength tests are found on the wiki: 96 * https://github.com/Cyan4973/xxHash/wiki 97 * 98 * Usage 99 * ------ 100 * All xxHash variants use a similar API. Changing the algorithm is a trivial 101 * substitution. 102 * 103 * @pre 104 * For functions which take an input and length parameter, the following 105 * requirements are assumed: 106 * - The range from [`input`, `input + length`) is valid, readable memory. 107 * - The only exception is if the `length` is `0`, `input` may be `NULL`. 108 * - For C++, the objects must have the *TriviallyCopyable* property, as the 109 * functions access bytes directly as if it was an array of `unsigned char`. 110 * 111 * @anchor single_shot_example 112 * **Single Shot** 113 * 114 * These functions are stateless functions which hash a contiguous block of memory, 115 * immediately returning the result. They are the easiest and usually the fastest 116 * option. 117 * 118 * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() 119 * 120 * @code{.c} 121 * #include <string.h> 122 * #include "xxhash.h" 123 * 124 * // Example for a function which hashes a null terminated string with XXH32(). 125 * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) 126 * { 127 * // NULL pointers are only valid if the length is zero 128 * size_t length = (string == NULL) ? 0 : strlen(string); 129 * return XXH32(string, length, seed); 130 * } 131 * @endcode 132 * 133 * 134 * @anchor streaming_example 135 * **Streaming** 136 * 137 * These groups of functions allow incremental hashing of unknown size, even 138 * more than what would fit in a size_t. 139 * 140 * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() 141 * 142 * @code{.c} 143 * #include <stdio.h> 144 * #include <assert.h> 145 * #include "xxhash.h" 146 * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). 147 * XXH64_hash_t hashFile(FILE* f) 148 * { 149 * // Allocate a state struct. Do not just use malloc() or new. 150 * XXH3_state_t* state = XXH3_createState(); 151 * assert(state != NULL && "Out of memory!"); 152 * // Reset the state to start a new hashing session. 153 * XXH3_64bits_reset(state); 154 * char buffer[4096]; 155 * size_t count; 156 * // Read the file in chunks 157 * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { 158 * // Run update() as many times as necessary to process the data 159 * XXH3_64bits_update(state, buffer, count); 160 * } 161 * // Retrieve the finalized hash. This will not change the state. 162 * XXH64_hash_t result = XXH3_64bits_digest(state); 163 * // Free the state. Do not use free(). 164 * XXH3_freeState(state); 165 * return result; 166 * } 167 * @endcode 168 * 169 * Streaming functions generate the xxHash value from an incremental input. 170 * This method is slower than single-call functions, due to state management. 171 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. 172 * 173 * An XXH state must first be allocated using `XXH*_createState()`. 174 * 175 * Start a new hash by initializing the state with a seed using `XXH*_reset()`. 176 * 177 * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. 178 * 179 * The function returns an error code, with 0 meaning OK, and any other value 180 * meaning there is an error. 181 * 182 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. 183 * This function returns the nn-bits hash as an int or long long. 184 * 185 * It's still possible to continue inserting input into the hash state after a 186 * digest, and generate new hash values later on by invoking `XXH*_digest()`. 187 * 188 * When done, release the state using `XXH*_freeState()`. 189 * 190 * 191 * @anchor canonical_representation_example 192 * **Canonical Representation** 193 * 194 * The default return values from XXH functions are unsigned 32, 64 and 128 bit 195 * integers. 196 * This the simplest and fastest format for further post-processing. 197 * 198 * However, this leaves open the question of what is the order on the byte level, 199 * since little and big endian conventions will store the same number differently. 200 * 201 * The canonical representation settles this issue by mandating big-endian 202 * convention, the same convention as human-readable numbers (large digits first). 203 * 204 * When writing hash values to storage, sending them over a network, or printing 205 * them, it's highly recommended to use the canonical representation to ensure 206 * portability across a wider range of systems, present and future. 207 * 208 * The following functions allow transformation of hash values to and from 209 * canonical format. 210 * 211 * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(), 212 * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(), 213 * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(), 214 * 215 * @code{.c} 216 * #include <stdio.h> 217 * #include "xxhash.h" 218 * 219 * // Example for a function which prints XXH32_hash_t in human readable format 220 * void printXxh32(XXH32_hash_t hash) 221 * { 222 * XXH32_canonical_t cano; 223 * XXH32_canonicalFromHash(&cano, hash); 224 * size_t i; 225 * for(i = 0; i < sizeof(cano.digest); ++i) { 226 * printf("%02x", cano.digest[i]); 227 * } 228 * printf("\n"); 229 * } 230 * 231 * // Example for a function which converts XXH32_canonical_t to XXH32_hash_t 232 * XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano) 233 * { 234 * XXH32_hash_t hash = XXH32_hashFromCanonical(&cano); 235 * return hash; 236 * } 237 * @endcode 238 * 239 * 240 * @file xxhash.h 241 * xxHash prototypes and implementation 242 */ 243 244 #if defined(__cplusplus) && !defined(XXH_NO_EXTERNC_GUARD) 245 extern "C" { 246 #endif 247 248 /* **************************** 249 * INLINE mode 250 ******************************/ 251 /*! 252 * @defgroup public Public API 253 * Contains details on the public xxHash functions. 254 * @{ 255 */ 256 #ifdef XXH_DOXYGEN 257 /*! 258 * @brief Gives access to internal state declaration, required for static allocation. 259 * 260 * Incompatible with dynamic linking, due to risks of ABI changes. 261 * 262 * Usage: 263 * @code{.c} 264 * #define XXH_STATIC_LINKING_ONLY 265 * #include "xxhash.h" 266 * @endcode 267 */ 268 # define XXH_STATIC_LINKING_ONLY 269 /* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ 270 271 /*! 272 * @brief Gives access to internal definitions. 273 * 274 * Usage: 275 * @code{.c} 276 * #define XXH_STATIC_LINKING_ONLY 277 * #define XXH_IMPLEMENTATION 278 * #include "xxhash.h" 279 * @endcode 280 */ 281 # define XXH_IMPLEMENTATION 282 /* Do not undef XXH_IMPLEMENTATION for Doxygen */ 283 284 /*! 285 * @brief Exposes the implementation and marks all functions as `inline`. 286 * 287 * Use these build macros to inline xxhash into the target unit. 288 * Inlining improves performance on small inputs, especially when the length is 289 * expressed as a compile-time constant: 290 * 291 * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html 292 * 293 * It also keeps xxHash symbols private to the unit, so they are not exported. 294 * 295 * Usage: 296 * @code{.c} 297 * #define XXH_INLINE_ALL 298 * #include "xxhash.h" 299 * @endcode 300 * Do not compile and link xxhash.o as a separate object, as it is not useful. 301 */ 302 # define XXH_INLINE_ALL 303 # undef XXH_INLINE_ALL 304 /*! 305 * @brief Exposes the implementation without marking functions as inline. 306 */ 307 # define XXH_PRIVATE_API 308 # undef XXH_PRIVATE_API 309 /*! 310 * @brief Emulate a namespace by transparently prefixing all symbols. 311 * 312 * If you want to include _and expose_ xxHash functions from within your own 313 * library, but also want to avoid symbol collisions with other libraries which 314 * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix 315 * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE 316 * (therefore, avoid empty or numeric values). 317 * 318 * Note that no change is required within the calling program as long as it 319 * includes `xxhash.h`: Regular symbol names will be automatically translated 320 * by this header. 321 */ 322 # define XXH_NAMESPACE /* YOUR NAME HERE */ 323 # undef XXH_NAMESPACE 324 #endif 325 326 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ 327 && !defined(XXH_INLINE_ALL_31684351384) 328 /* this section should be traversed only once */ 329 # define XXH_INLINE_ALL_31684351384 330 /* give access to the advanced API, required to compile implementations */ 331 # undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ 332 # define XXH_STATIC_LINKING_ONLY 333 /* make all functions private */ 334 # undef XXH_PUBLIC_API 335 # if defined(__GNUC__) 336 # define XXH_PUBLIC_API static __inline __attribute__((__unused__)) 337 # elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) 338 # define XXH_PUBLIC_API static inline 339 # elif defined(_MSC_VER) 340 # define XXH_PUBLIC_API static __inline 341 # else 342 /* note: this version may generate warnings for unused static functions */ 343 # define XXH_PUBLIC_API static 344 # endif 345 346 /* 347 * This part deals with the special case where a unit wants to inline xxHash, 348 * but "xxhash.h" has previously been included without XXH_INLINE_ALL, 349 * such as part of some previously included *.h header file. 350 * Without further action, the new include would just be ignored, 351 * and functions would effectively _not_ be inlined (silent failure). 352 * The following macros solve this situation by prefixing all inlined names, 353 * avoiding naming collision with previous inclusions. 354 */ 355 /* Before that, we unconditionally #undef all symbols, 356 * in case they were already defined with XXH_NAMESPACE. 357 * They will then be redefined for XXH_INLINE_ALL 358 */ 359 # undef XXH_versionNumber 360 /* XXH32 */ 361 # undef XXH32 362 # undef XXH32_createState 363 # undef XXH32_freeState 364 # undef XXH32_reset 365 # undef XXH32_update 366 # undef XXH32_digest 367 # undef XXH32_copyState 368 # undef XXH32_canonicalFromHash 369 # undef XXH32_hashFromCanonical 370 /* XXH64 */ 371 # undef XXH64 372 # undef XXH64_createState 373 # undef XXH64_freeState 374 # undef XXH64_reset 375 # undef XXH64_update 376 # undef XXH64_digest 377 # undef XXH64_copyState 378 # undef XXH64_canonicalFromHash 379 # undef XXH64_hashFromCanonical 380 /* XXH3_64bits */ 381 # undef XXH3_64bits 382 # undef XXH3_64bits_withSecret 383 # undef XXH3_64bits_withSeed 384 # undef XXH3_64bits_withSecretandSeed 385 # undef XXH3_createState 386 # undef XXH3_freeState 387 # undef XXH3_copyState 388 # undef XXH3_64bits_reset 389 # undef XXH3_64bits_reset_withSeed 390 # undef XXH3_64bits_reset_withSecret 391 # undef XXH3_64bits_update 392 # undef XXH3_64bits_digest 393 # undef XXH3_generateSecret 394 /* XXH3_128bits */ 395 # undef XXH128 396 # undef XXH3_128bits 397 # undef XXH3_128bits_withSeed 398 # undef XXH3_128bits_withSecret 399 # undef XXH3_128bits_reset 400 # undef XXH3_128bits_reset_withSeed 401 # undef XXH3_128bits_reset_withSecret 402 # undef XXH3_128bits_reset_withSecretandSeed 403 # undef XXH3_128bits_update 404 # undef XXH3_128bits_digest 405 # undef XXH128_isEqual 406 # undef XXH128_cmp 407 # undef XXH128_canonicalFromHash 408 # undef XXH128_hashFromCanonical 409 /* Finally, free the namespace itself */ 410 # undef XXH_NAMESPACE 411 412 /* employ the namespace for XXH_INLINE_ALL */ 413 # define XXH_NAMESPACE XXH_INLINE_ 414 /* 415 * Some identifiers (enums, type names) are not symbols, 416 * but they must nonetheless be renamed to avoid redeclaration. 417 * Alternative solution: do not redeclare them. 418 * However, this requires some #ifdefs, and has a more dispersed impact. 419 * Meanwhile, renaming can be achieved in a single place. 420 */ 421 # define XXH_IPREF(Id) XXH_NAMESPACE ## Id 422 # define XXH_OK XXH_IPREF(XXH_OK) 423 # define XXH_ERROR XXH_IPREF(XXH_ERROR) 424 # define XXH_errorcode XXH_IPREF(XXH_errorcode) 425 # define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) 426 # define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) 427 # define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) 428 # define XXH32_state_s XXH_IPREF(XXH32_state_s) 429 # define XXH32_state_t XXH_IPREF(XXH32_state_t) 430 # define XXH64_state_s XXH_IPREF(XXH64_state_s) 431 # define XXH64_state_t XXH_IPREF(XXH64_state_t) 432 # define XXH3_state_s XXH_IPREF(XXH3_state_s) 433 # define XXH3_state_t XXH_IPREF(XXH3_state_t) 434 # define XXH128_hash_t XXH_IPREF(XXH128_hash_t) 435 /* Ensure the header is parsed again, even if it was previously included */ 436 # undef XXHASH_H_5627135585666179 437 # undef XXHASH_H_STATIC_13879238742 438 #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ 439 440 /* **************************************************************** 441 * Stable API 442 *****************************************************************/ 443 #ifndef XXHASH_H_5627135585666179 444 #define XXHASH_H_5627135585666179 1 445 446 /*! @brief Marks a global symbol. */ 447 #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) 448 # if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) 449 # ifdef XXH_EXPORT 450 # define XXH_PUBLIC_API __declspec(dllexport) 451 # elif XXH_IMPORT 452 # define XXH_PUBLIC_API __declspec(dllimport) 453 # endif 454 # else 455 # define XXH_PUBLIC_API /* do nothing */ 456 # endif 457 #endif 458 459 #ifdef XXH_NAMESPACE 460 # define XXH_CAT(A,B) A##B 461 # define XXH_NAME2(A,B) XXH_CAT(A,B) 462 # define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) 463 /* XXH32 */ 464 # define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) 465 # define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) 466 # define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) 467 # define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) 468 # define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) 469 # define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) 470 # define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) 471 # define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) 472 # define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) 473 /* XXH64 */ 474 # define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) 475 # define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) 476 # define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) 477 # define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) 478 # define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) 479 # define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) 480 # define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) 481 # define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) 482 # define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) 483 /* XXH3_64bits */ 484 # define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) 485 # define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) 486 # define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) 487 # define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) 488 # define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) 489 # define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) 490 # define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) 491 # define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) 492 # define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) 493 # define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) 494 # define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) 495 # define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) 496 # define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) 497 # define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) 498 # define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) 499 /* XXH3_128bits */ 500 # define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) 501 # define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) 502 # define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) 503 # define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) 504 # define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) 505 # define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) 506 # define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) 507 # define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) 508 # define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) 509 # define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) 510 # define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) 511 # define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) 512 # define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) 513 # define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) 514 # define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) 515 #endif 516 517 518 /* ************************************* 519 * Compiler specifics 520 ***************************************/ 521 522 /* specific declaration modes for Windows */ 523 #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) 524 # if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) 525 # ifdef XXH_EXPORT 526 # define XXH_PUBLIC_API __declspec(dllexport) 527 # elif XXH_IMPORT 528 # define XXH_PUBLIC_API __declspec(dllimport) 529 # endif 530 # else 531 # define XXH_PUBLIC_API /* do nothing */ 532 # endif 533 #endif 534 535 #if defined (__GNUC__) 536 # define XXH_CONSTF __attribute__((__const__)) 537 # define XXH_PUREF __attribute__((__pure__)) 538 # define XXH_MALLOCF __attribute__((__malloc__)) 539 #else 540 # define XXH_CONSTF /* disable */ 541 # define XXH_PUREF 542 # define XXH_MALLOCF 543 #endif 544 545 /* ************************************* 546 * Version 547 ***************************************/ 548 #define XXH_VERSION_MAJOR 0 549 #define XXH_VERSION_MINOR 8 550 #define XXH_VERSION_RELEASE 3 551 /*! @brief Version number, encoded as two digits each */ 552 #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) 553 554 /*! 555 * @brief Obtains the xxHash version. 556 * 557 * This is mostly useful when xxHash is compiled as a shared library, 558 * since the returned value comes from the library, as opposed to header file. 559 * 560 * @return @ref XXH_VERSION_NUMBER of the invoked library. 561 */ 562 XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); 563 564 565 /* **************************** 566 * Common basic types 567 ******************************/ 568 #include <stddef.h> /* size_t */ 569 /*! 570 * @brief Exit code for the streaming API. 571 */ 572 typedef enum { 573 XXH_OK = 0, /*!< OK */ 574 XXH_ERROR /*!< Error */ 575 } XXH_errorcode; 576 577 578 /*-********************************************************************** 579 * 32-bit hash 580 ************************************************************************/ 581 #if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */ 582 /*! 583 * @brief An unsigned 32-bit integer. 584 * 585 * Not necessarily defined to `uint32_t` but functionally equivalent. 586 */ 587 typedef uint32_t XXH32_hash_t; 588 589 #elif !defined (__VMS) \ 590 && (defined (__cplusplus) \ 591 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) 592 # ifdef _AIX 593 # include <inttypes.h> 594 # else 595 # include <stdint.h> 596 # endif 597 typedef uint32_t XXH32_hash_t; 598 599 #else 600 # include <limits.h> 601 # if UINT_MAX == 0xFFFFFFFFUL 602 typedef unsigned int XXH32_hash_t; 603 # elif ULONG_MAX == 0xFFFFFFFFUL 604 typedef unsigned long XXH32_hash_t; 605 # else 606 # error "unsupported platform: need a 32-bit type" 607 # endif 608 #endif 609 610 /*! 611 * @} 612 * 613 * @defgroup XXH32_family XXH32 family 614 * @ingroup public 615 * Contains functions used in the classic 32-bit xxHash algorithm. 616 * 617 * @note 618 * XXH32 is useful for older platforms, with no or poor 64-bit performance. 619 * Note that the @ref XXH3_family provides competitive speed for both 32-bit 620 * and 64-bit systems, and offers true 64/128 bit hash results. 621 * 622 * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families 623 * @see @ref XXH32_impl for implementation details 624 * @{ 625 */ 626 627 /*! 628 * @brief Calculates the 32-bit hash of @p input using xxHash32. 629 * 630 * @param input The block of data to be hashed, at least @p length bytes in size. 631 * @param length The length of @p input, in bytes. 632 * @param seed The 32-bit seed to alter the hash's output predictably. 633 * 634 * @pre 635 * The memory between @p input and @p input + @p length must be valid, 636 * readable, contiguous memory. However, if @p length is `0`, @p input may be 637 * `NULL`. In C++, this also must be *TriviallyCopyable*. 638 * 639 * @return The calculated 32-bit xxHash32 value. 640 * 641 * @see @ref single_shot_example "Single Shot Example" for an example. 642 */ 643 XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); 644 645 #ifndef XXH_NO_STREAM 646 /*! 647 * @typedef struct XXH32_state_s XXH32_state_t 648 * @brief The opaque state struct for the XXH32 streaming API. 649 * 650 * @see XXH32_state_s for details. 651 * @see @ref streaming_example "Streaming Example" 652 */ 653 typedef struct XXH32_state_s XXH32_state_t; 654 655 /*! 656 * @brief Allocates an @ref XXH32_state_t. 657 * 658 * @return An allocated pointer of @ref XXH32_state_t on success. 659 * @return `NULL` on failure. 660 * 661 * @note Must be freed with XXH32_freeState(). 662 * 663 * @see @ref streaming_example "Streaming Example" 664 */ 665 XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); 666 /*! 667 * @brief Frees an @ref XXH32_state_t. 668 * 669 * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). 670 * 671 * @return @ref XXH_OK. 672 * 673 * @note @p statePtr must be allocated with XXH32_createState(). 674 * 675 * @see @ref streaming_example "Streaming Example" 676 * 677 */ 678 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); 679 /*! 680 * @brief Copies one @ref XXH32_state_t to another. 681 * 682 * @param dst_state The state to copy to. 683 * @param src_state The state to copy from. 684 * @pre 685 * @p dst_state and @p src_state must not be `NULL` and must not overlap. 686 */ 687 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); 688 689 /*! 690 * @brief Resets an @ref XXH32_state_t to begin a new hash. 691 * 692 * @param statePtr The state struct to reset. 693 * @param seed The 32-bit seed to alter the hash result predictably. 694 * 695 * @pre 696 * @p statePtr must not be `NULL`. 697 * 698 * @return @ref XXH_OK on success. 699 * @return @ref XXH_ERROR on failure. 700 * 701 * @note This function resets and seeds a state. Call it before @ref XXH32_update(). 702 * 703 * @see @ref streaming_example "Streaming Example" 704 */ 705 XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); 706 707 /*! 708 * @brief Consumes a block of @p input to an @ref XXH32_state_t. 709 * 710 * @param statePtr The state struct to update. 711 * @param input The block of data to be hashed, at least @p length bytes in size. 712 * @param length The length of @p input, in bytes. 713 * 714 * @pre 715 * @p statePtr must not be `NULL`. 716 * @pre 717 * The memory between @p input and @p input + @p length must be valid, 718 * readable, contiguous memory. However, if @p length is `0`, @p input may be 719 * `NULL`. In C++, this also must be *TriviallyCopyable*. 720 * 721 * @return @ref XXH_OK on success. 722 * @return @ref XXH_ERROR on failure. 723 * 724 * @note Call this to incrementally consume blocks of data. 725 * 726 * @see @ref streaming_example "Streaming Example" 727 */ 728 XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); 729 730 /*! 731 * @brief Returns the calculated hash value from an @ref XXH32_state_t. 732 * 733 * @param statePtr The state struct to calculate the hash from. 734 * 735 * @pre 736 * @p statePtr must not be `NULL`. 737 * 738 * @return The calculated 32-bit xxHash32 value from that state. 739 * 740 * @note 741 * Calling XXH32_digest() will not affect @p statePtr, so you can update, 742 * digest, and update again. 743 * 744 * @see @ref streaming_example "Streaming Example" 745 */ 746 XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); 747 #endif /* !XXH_NO_STREAM */ 748 749 /******* Canonical representation *******/ 750 751 /*! 752 * @brief Canonical (big endian) representation of @ref XXH32_hash_t. 753 */ 754 typedef struct { 755 unsigned char digest[4]; /*!< Hash bytes, big endian */ 756 } XXH32_canonical_t; 757 758 /*! 759 * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. 760 * 761 * @param dst The @ref XXH32_canonical_t pointer to be stored to. 762 * @param hash The @ref XXH32_hash_t to be converted. 763 * 764 * @pre 765 * @p dst must not be `NULL`. 766 * 767 * @see @ref canonical_representation_example "Canonical Representation Example" 768 */ 769 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); 770 771 /*! 772 * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. 773 * 774 * @param src The @ref XXH32_canonical_t to convert. 775 * 776 * @pre 777 * @p src must not be `NULL`. 778 * 779 * @return The converted hash. 780 * 781 * @see @ref canonical_representation_example "Canonical Representation Example" 782 */ 783 XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); 784 785 786 /*! @cond Doxygen ignores this part */ 787 #ifdef __has_attribute 788 # define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) 789 #else 790 # define XXH_HAS_ATTRIBUTE(x) 0 791 #endif 792 /*! @endcond */ 793 794 /*! @cond Doxygen ignores this part */ 795 /* C-language Attributes are added in C23. */ 796 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L) && defined(__has_c_attribute) 797 # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) 798 #else 799 # define XXH_HAS_C_ATTRIBUTE(x) 0 800 #endif 801 /*! @endcond */ 802 803 /*! @cond Doxygen ignores this part */ 804 #if defined(__cplusplus) && defined(__has_cpp_attribute) 805 # define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) 806 #else 807 # define XXH_HAS_CPP_ATTRIBUTE(x) 0 808 #endif 809 /*! @endcond */ 810 811 /*! @cond Doxygen ignores this part */ 812 /* 813 * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute 814 * introduced in CPP17 and C23. 815 * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough 816 * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough 817 */ 818 #if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) 819 # define XXH_FALLTHROUGH [[fallthrough]] 820 #elif XXH_HAS_ATTRIBUTE(__fallthrough__) 821 # define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) 822 #else 823 # define XXH_FALLTHROUGH /* fallthrough */ 824 #endif 825 /*! @endcond */ 826 827 /*! @cond Doxygen ignores this part */ 828 /* 829 * Define XXH_NOESCAPE for annotated pointers in public API. 830 * https://clang.llvm.org/docs/AttributeReference.html#noescape 831 * As of writing this, only supported by clang. 832 */ 833 #if XXH_HAS_ATTRIBUTE(noescape) 834 # define XXH_NOESCAPE __attribute__((__noescape__)) 835 #else 836 # define XXH_NOESCAPE 837 #endif 838 /*! @endcond */ 839 840 841 /*! 842 * @} 843 * @ingroup public 844 * @{ 845 */ 846 847 #ifndef XXH_NO_LONG_LONG 848 /*-********************************************************************** 849 * 64-bit hash 850 ************************************************************************/ 851 #if defined(XXH_DOXYGEN) /* don't include <stdint.h> */ 852 /*! 853 * @brief An unsigned 64-bit integer. 854 * 855 * Not necessarily defined to `uint64_t` but functionally equivalent. 856 */ 857 typedef uint64_t XXH64_hash_t; 858 #elif !defined (__VMS) \ 859 && (defined (__cplusplus) \ 860 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) 861 # ifdef _AIX 862 # include <inttypes.h> 863 # else 864 # include <stdint.h> 865 # endif 866 typedef uint64_t XXH64_hash_t; 867 #else 868 # include <limits.h> 869 # if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL 870 /* LP64 ABI says uint64_t is unsigned long */ 871 typedef unsigned long XXH64_hash_t; 872 # else 873 /* the following type must have a width of 64-bit */ 874 typedef unsigned long long XXH64_hash_t; 875 # endif 876 #endif 877 878 /*! 879 * @} 880 * 881 * @defgroup XXH64_family XXH64 family 882 * @ingroup public 883 * @{ 884 * Contains functions used in the classic 64-bit xxHash algorithm. 885 * 886 * @note 887 * XXH3 provides competitive speed for both 32-bit and 64-bit systems, 888 * and offers true 64/128 bit hash results. 889 * It provides better speed for systems with vector processing capabilities. 890 */ 891 892 /*! 893 * @brief Calculates the 64-bit hash of @p input using xxHash64. 894 * 895 * @param input The block of data to be hashed, at least @p length bytes in size. 896 * @param length The length of @p input, in bytes. 897 * @param seed The 64-bit seed to alter the hash's output predictably. 898 * 899 * @pre 900 * The memory between @p input and @p input + @p length must be valid, 901 * readable, contiguous memory. However, if @p length is `0`, @p input may be 902 * `NULL`. In C++, this also must be *TriviallyCopyable*. 903 * 904 * @return The calculated 64-bit xxHash64 value. 905 * 906 * @see @ref single_shot_example "Single Shot Example" for an example. 907 */ 908 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); 909 910 /******* Streaming *******/ 911 #ifndef XXH_NO_STREAM 912 /*! 913 * @brief The opaque state struct for the XXH64 streaming API. 914 * 915 * @see XXH64_state_s for details. 916 * @see @ref streaming_example "Streaming Example" 917 */ 918 typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ 919 920 /*! 921 * @brief Allocates an @ref XXH64_state_t. 922 * 923 * @return An allocated pointer of @ref XXH64_state_t on success. 924 * @return `NULL` on failure. 925 * 926 * @note Must be freed with XXH64_freeState(). 927 * 928 * @see @ref streaming_example "Streaming Example" 929 */ 930 XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); 931 932 /*! 933 * @brief Frees an @ref XXH64_state_t. 934 * 935 * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). 936 * 937 * @return @ref XXH_OK. 938 * 939 * @note @p statePtr must be allocated with XXH64_createState(). 940 * 941 * @see @ref streaming_example "Streaming Example" 942 */ 943 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); 944 945 /*! 946 * @brief Copies one @ref XXH64_state_t to another. 947 * 948 * @param dst_state The state to copy to. 949 * @param src_state The state to copy from. 950 * @pre 951 * @p dst_state and @p src_state must not be `NULL` and must not overlap. 952 */ 953 XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); 954 955 /*! 956 * @brief Resets an @ref XXH64_state_t to begin a new hash. 957 * 958 * @param statePtr The state struct to reset. 959 * @param seed The 64-bit seed to alter the hash result predictably. 960 * 961 * @pre 962 * @p statePtr must not be `NULL`. 963 * 964 * @return @ref XXH_OK on success. 965 * @return @ref XXH_ERROR on failure. 966 * 967 * @note This function resets and seeds a state. Call it before @ref XXH64_update(). 968 * 969 * @see @ref streaming_example "Streaming Example" 970 */ 971 XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); 972 973 /*! 974 * @brief Consumes a block of @p input to an @ref XXH64_state_t. 975 * 976 * @param statePtr The state struct to update. 977 * @param input The block of data to be hashed, at least @p length bytes in size. 978 * @param length The length of @p input, in bytes. 979 * 980 * @pre 981 * @p statePtr must not be `NULL`. 982 * @pre 983 * The memory between @p input and @p input + @p length must be valid, 984 * readable, contiguous memory. However, if @p length is `0`, @p input may be 985 * `NULL`. In C++, this also must be *TriviallyCopyable*. 986 * 987 * @return @ref XXH_OK on success. 988 * @return @ref XXH_ERROR on failure. 989 * 990 * @note Call this to incrementally consume blocks of data. 991 * 992 * @see @ref streaming_example "Streaming Example" 993 */ 994 XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); 995 996 /*! 997 * @brief Returns the calculated hash value from an @ref XXH64_state_t. 998 * 999 * @param statePtr The state struct to calculate the hash from. 1000 * 1001 * @pre 1002 * @p statePtr must not be `NULL`. 1003 * 1004 * @return The calculated 64-bit xxHash64 value from that state. 1005 * 1006 * @note 1007 * Calling XXH64_digest() will not affect @p statePtr, so you can update, 1008 * digest, and update again. 1009 * 1010 * @see @ref streaming_example "Streaming Example" 1011 */ 1012 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); 1013 #endif /* !XXH_NO_STREAM */ 1014 /******* Canonical representation *******/ 1015 1016 /*! 1017 * @brief Canonical (big endian) representation of @ref XXH64_hash_t. 1018 */ 1019 typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; 1020 1021 /*! 1022 * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. 1023 * 1024 * @param dst The @ref XXH64_canonical_t pointer to be stored to. 1025 * @param hash The @ref XXH64_hash_t to be converted. 1026 * 1027 * @pre 1028 * @p dst must not be `NULL`. 1029 * 1030 * @see @ref canonical_representation_example "Canonical Representation Example" 1031 */ 1032 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); 1033 1034 /*! 1035 * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. 1036 * 1037 * @param src The @ref XXH64_canonical_t to convert. 1038 * 1039 * @pre 1040 * @p src must not be `NULL`. 1041 * 1042 * @return The converted hash. 1043 * 1044 * @see @ref canonical_representation_example "Canonical Representation Example" 1045 */ 1046 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); 1047 1048 #ifndef XXH_NO_XXH3 1049 1050 /*! 1051 * @} 1052 * ************************************************************************ 1053 * @defgroup XXH3_family XXH3 family 1054 * @ingroup public 1055 * @{ 1056 * 1057 * XXH3 is a more recent hash algorithm featuring: 1058 * - Improved speed for both small and large inputs 1059 * - True 64-bit and 128-bit outputs 1060 * - SIMD acceleration 1061 * - Improved 32-bit viability 1062 * 1063 * Speed analysis methodology is explained here: 1064 * 1065 * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html 1066 * 1067 * Compared to XXH64, expect XXH3 to run approximately 1068 * ~2x faster on large inputs and >3x faster on small ones, 1069 * exact differences vary depending on platform. 1070 * 1071 * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, 1072 * but does not require it. 1073 * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 1074 * at competitive speeds, even without vector support. Further details are 1075 * explained in the implementation. 1076 * 1077 * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD 1078 * implementations for many common platforms: 1079 * - AVX512 1080 * - AVX2 1081 * - SSE2 1082 * - ARM NEON 1083 * - WebAssembly SIMD128 1084 * - POWER8 VSX 1085 * - s390x ZVector 1086 * This can be controlled via the @ref XXH_VECTOR macro, but it automatically 1087 * selects the best version according to predefined macros. For the x86 family, an 1088 * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. 1089 * 1090 * XXH3 implementation is portable: 1091 * it has a generic C90 formulation that can be compiled on any platform, 1092 * all implementations generate exactly the same hash value on all platforms. 1093 * Starting from v0.8.0, it's also labelled "stable", meaning that 1094 * any future version will also generate the same hash value. 1095 * 1096 * XXH3 offers 2 variants, _64bits and _128bits. 1097 * 1098 * When only 64 bits are needed, prefer invoking the _64bits variant, as it 1099 * reduces the amount of mixing, resulting in faster speed on small inputs. 1100 * It's also generally simpler to manipulate a scalar return type than a struct. 1101 * 1102 * The API supports one-shot hashing, streaming mode, and custom secrets. 1103 */ 1104 1105 /*! 1106 * @ingroup tuning 1107 * @brief Possible values for @ref XXH_VECTOR. 1108 * 1109 * Unless set explicitly, determined automatically. 1110 */ 1111 # define XXH_SCALAR 0 /*!< Portable scalar version */ 1112 # define XXH_SSE2 1 /*!< SSE2 for Pentium 4, Opteron, all x86_64. */ 1113 # define XXH_AVX2 2 /*!< AVX2 for Haswell and Bulldozer */ 1114 # define XXH_AVX512 3 /*!< AVX512 for Skylake and Icelake */ 1115 # define XXH_NEON 4 /*!< NEON for most ARMv7-A, all AArch64, and WASM SIMD128 */ 1116 # define XXH_VSX 5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */ 1117 # define XXH_SVE 6 /*!< SVE for some ARMv8-A and ARMv9-A */ 1118 # define XXH_LSX 7 /*!< LSX (128-bit SIMD) for LoongArch64 */ 1119 # define XXH_LASX 8 /*!< LASX (256-bit SIMD) for LoongArch64 */ 1120 # define XXH_RVV 9 /*!< RVV (RISC-V Vector) for RISC-V */ 1121 1122 /*-********************************************************************** 1123 * XXH3 64-bit variant 1124 ************************************************************************/ 1125 1126 /*! 1127 * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input. 1128 * 1129 * @param input The block of data to be hashed, at least @p length bytes in size. 1130 * @param length The length of @p input, in bytes. 1131 * 1132 * @pre 1133 * The memory between @p input and @p input + @p length must be valid, 1134 * readable, contiguous memory. However, if @p length is `0`, @p input may be 1135 * `NULL`. In C++, this also must be *TriviallyCopyable*. 1136 * 1137 * @return The calculated 64-bit XXH3 hash value. 1138 * 1139 * @note 1140 * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however 1141 * it may have slightly better performance due to constant propagation of the 1142 * defaults. 1143 * 1144 * @see 1145 * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants 1146 * @see @ref single_shot_example "Single Shot Example" for an example. 1147 */ 1148 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); 1149 1150 /*! 1151 * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input. 1152 * 1153 * @param input The block of data to be hashed, at least @p length bytes in size. 1154 * @param length The length of @p input, in bytes. 1155 * @param seed The 64-bit seed to alter the hash result predictably. 1156 * 1157 * @pre 1158 * The memory between @p input and @p input + @p length must be valid, 1159 * readable, contiguous memory. However, if @p length is `0`, @p input may be 1160 * `NULL`. In C++, this also must be *TriviallyCopyable*. 1161 * 1162 * @return The calculated 64-bit XXH3 hash value. 1163 * 1164 * @note 1165 * seed == 0 produces the same results as @ref XXH3_64bits(). 1166 * 1167 * This variant generates a custom secret on the fly based on default secret 1168 * altered using the @p seed value. 1169 * 1170 * While this operation is decently fast, note that it's not completely free. 1171 * 1172 * @see @ref single_shot_example "Single Shot Example" for an example. 1173 */ 1174 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); 1175 1176 /*! 1177 * The bare minimum size for a custom secret. 1178 * 1179 * @see 1180 * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), 1181 * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). 1182 */ 1183 #define XXH3_SECRET_SIZE_MIN 136 1184 1185 /*! 1186 * @brief Calculates 64-bit variant of XXH3 with a custom "secret". 1187 * 1188 * @param data The block of data to be hashed, at least @p len bytes in size. 1189 * @param len The length of @p data, in bytes. 1190 * @param secret The secret data. 1191 * @param secretSize The length of @p secret, in bytes. 1192 * 1193 * @return The calculated 64-bit XXH3 hash value. 1194 * 1195 * @pre 1196 * The memory between @p data and @p data + @p len must be valid, 1197 * readable, contiguous memory. However, if @p length is `0`, @p data may be 1198 * `NULL`. In C++, this also must be *TriviallyCopyable*. 1199 * 1200 * It's possible to provide any blob of bytes as a "secret" to generate the hash. 1201 * This makes it more difficult for an external actor to prepare an intentional collision. 1202 * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). 1203 * However, the quality of the secret impacts the dispersion of the hash algorithm. 1204 * Therefore, the secret _must_ look like a bunch of random bytes. 1205 * Avoid "trivial" or structured data such as repeated sequences or a text document. 1206 * Whenever in doubt about the "randomness" of the blob of bytes, 1207 * consider employing @ref XXH3_generateSecret() instead (see below). 1208 * It will generate a proper high entropy secret derived from the blob of bytes. 1209 * Another advantage of using XXH3_generateSecret() is that 1210 * it guarantees that all bits within the initial blob of bytes 1211 * will impact every bit of the output. 1212 * This is not necessarily the case when using the blob of bytes directly 1213 * because, when hashing _small_ inputs, only a portion of the secret is employed. 1214 * 1215 * @see @ref single_shot_example "Single Shot Example" for an example. 1216 */ 1217 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); 1218 1219 1220 /******* Streaming *******/ 1221 #ifndef XXH_NO_STREAM 1222 /* 1223 * Streaming requires state maintenance. 1224 * This operation costs memory and CPU. 1225 * As a consequence, streaming is slower than one-shot hashing. 1226 * For better performance, prefer one-shot functions whenever applicable. 1227 */ 1228 1229 /*! 1230 * @brief The opaque state struct for the XXH3 streaming API. 1231 * 1232 * @see XXH3_state_s for details. 1233 * @see @ref streaming_example "Streaming Example" 1234 */ 1235 typedef struct XXH3_state_s XXH3_state_t; 1236 XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); 1237 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); 1238 1239 /*! 1240 * @brief Copies one @ref XXH3_state_t to another. 1241 * 1242 * @param dst_state The state to copy to. 1243 * @param src_state The state to copy from. 1244 * @pre 1245 * @p dst_state and @p src_state must not be `NULL` and must not overlap. 1246 */ 1247 XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); 1248 1249 /*! 1250 * @brief Resets an @ref XXH3_state_t to begin a new hash. 1251 * 1252 * @param statePtr The state struct to reset. 1253 * 1254 * @pre 1255 * @p statePtr must not be `NULL`. 1256 * 1257 * @return @ref XXH_OK on success. 1258 * @return @ref XXH_ERROR on failure. 1259 * 1260 * @note 1261 * - This function resets `statePtr` and generate a secret with default parameters. 1262 * - Call this function before @ref XXH3_64bits_update(). 1263 * - Digest will be equivalent to `XXH3_64bits()`. 1264 * 1265 * @see @ref streaming_example "Streaming Example" 1266 * 1267 */ 1268 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); 1269 1270 /*! 1271 * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. 1272 * 1273 * @param statePtr The state struct to reset. 1274 * @param seed The 64-bit seed to alter the hash result predictably. 1275 * 1276 * @pre 1277 * @p statePtr must not be `NULL`. 1278 * 1279 * @return @ref XXH_OK on success. 1280 * @return @ref XXH_ERROR on failure. 1281 * 1282 * @note 1283 * - This function resets `statePtr` and generate a secret from `seed`. 1284 * - Call this function before @ref XXH3_64bits_update(). 1285 * - Digest will be equivalent to `XXH3_64bits_withSeed()`. 1286 * 1287 * @see @ref streaming_example "Streaming Example" 1288 * 1289 */ 1290 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); 1291 1292 /*! 1293 * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. 1294 * 1295 * @param statePtr The state struct to reset. 1296 * @param secret The secret data. 1297 * @param secretSize The length of @p secret, in bytes. 1298 * 1299 * @pre 1300 * @p statePtr must not be `NULL`. 1301 * 1302 * @return @ref XXH_OK on success. 1303 * @return @ref XXH_ERROR on failure. 1304 * 1305 * @note 1306 * `secret` is referenced, it _must outlive_ the hash streaming session. 1307 * 1308 * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, 1309 * and the quality of produced hash values depends on secret's entropy 1310 * (secret's content should look like a bunch of random bytes). 1311 * When in doubt about the randomness of a candidate `secret`, 1312 * consider employing `XXH3_generateSecret()` instead (see below). 1313 * 1314 * @see @ref streaming_example "Streaming Example" 1315 */ 1316 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); 1317 1318 /*! 1319 * @brief Consumes a block of @p input to an @ref XXH3_state_t. 1320 * 1321 * @param statePtr The state struct to update. 1322 * @param input The block of data to be hashed, at least @p length bytes in size. 1323 * @param length The length of @p input, in bytes. 1324 * 1325 * @pre 1326 * @p statePtr must not be `NULL`. 1327 * @pre 1328 * The memory between @p input and @p input + @p length must be valid, 1329 * readable, contiguous memory. However, if @p length is `0`, @p input may be 1330 * `NULL`. In C++, this also must be *TriviallyCopyable*. 1331 * 1332 * @return @ref XXH_OK on success. 1333 * @return @ref XXH_ERROR on failure. 1334 * 1335 * @note Call this to incrementally consume blocks of data. 1336 * 1337 * @see @ref streaming_example "Streaming Example" 1338 */ 1339 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); 1340 1341 /*! 1342 * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. 1343 * 1344 * @param statePtr The state struct to calculate the hash from. 1345 * 1346 * @pre 1347 * @p statePtr must not be `NULL`. 1348 * 1349 * @return The calculated XXH3 64-bit hash value from that state. 1350 * 1351 * @note 1352 * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, 1353 * digest, and update again. 1354 * 1355 * @see @ref streaming_example "Streaming Example" 1356 */ 1357 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); 1358 #endif /* !XXH_NO_STREAM */ 1359 1360 /* note : canonical representation of XXH3 is the same as XXH64 1361 * since they both produce XXH64_hash_t values */ 1362 1363 1364 /*-********************************************************************** 1365 * XXH3 128-bit variant 1366 ************************************************************************/ 1367 1368 /*! 1369 * @brief The return value from 128-bit hashes. 1370 * 1371 * Stored in little endian order, although the fields themselves are in native 1372 * endianness. 1373 */ 1374 typedef struct { 1375 XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ 1376 XXH64_hash_t high64; /*!< `value >> 64` */ 1377 } XXH128_hash_t; 1378 1379 /*! 1380 * @brief Calculates 128-bit unseeded variant of XXH3 of @p data. 1381 * 1382 * @param data The block of data to be hashed, at least @p length bytes in size. 1383 * @param len The length of @p data, in bytes. 1384 * 1385 * @return The calculated 128-bit variant of XXH3 value. 1386 * 1387 * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead 1388 * for shorter inputs. 1389 * 1390 * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however 1391 * it may have slightly better performance due to constant propagation of the 1392 * defaults. 1393 * 1394 * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants 1395 * @see @ref single_shot_example "Single Shot Example" for an example. 1396 */ 1397 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len); 1398 /*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. 1399 * 1400 * @param data The block of data to be hashed, at least @p length bytes in size. 1401 * @param len The length of @p data, in bytes. 1402 * @param seed The 64-bit seed to alter the hash result predictably. 1403 * 1404 * @return The calculated 128-bit variant of XXH3 value. 1405 * 1406 * @note 1407 * seed == 0 produces the same results as @ref XXH3_64bits(). 1408 * 1409 * This variant generates a custom secret on the fly based on default secret 1410 * altered using the @p seed value. 1411 * 1412 * While this operation is decently fast, note that it's not completely free. 1413 * 1414 * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants 1415 * @see @ref single_shot_example "Single Shot Example" for an example. 1416 */ 1417 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); 1418 /*! 1419 * @brief Calculates 128-bit variant of XXH3 with a custom "secret". 1420 * 1421 * @param data The block of data to be hashed, at least @p len bytes in size. 1422 * @param len The length of @p data, in bytes. 1423 * @param secret The secret data. 1424 * @param secretSize The length of @p secret, in bytes. 1425 * 1426 * @return The calculated 128-bit variant of XXH3 value. 1427 * 1428 * It's possible to provide any blob of bytes as a "secret" to generate the hash. 1429 * This makes it more difficult for an external actor to prepare an intentional collision. 1430 * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). 1431 * However, the quality of the secret impacts the dispersion of the hash algorithm. 1432 * Therefore, the secret _must_ look like a bunch of random bytes. 1433 * Avoid "trivial" or structured data such as repeated sequences or a text document. 1434 * Whenever in doubt about the "randomness" of the blob of bytes, 1435 * consider employing @ref XXH3_generateSecret() instead (see below). 1436 * It will generate a proper high entropy secret derived from the blob of bytes. 1437 * Another advantage of using XXH3_generateSecret() is that 1438 * it guarantees that all bits within the initial blob of bytes 1439 * will impact every bit of the output. 1440 * This is not necessarily the case when using the blob of bytes directly 1441 * because, when hashing _small_ inputs, only a portion of the secret is employed. 1442 * 1443 * @see @ref single_shot_example "Single Shot Example" for an example. 1444 */ 1445 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); 1446 1447 /******* Streaming *******/ 1448 #ifndef XXH_NO_STREAM 1449 /* 1450 * Streaming requires state maintenance. 1451 * This operation costs memory and CPU. 1452 * As a consequence, streaming is slower than one-shot hashing. 1453 * For better performance, prefer one-shot functions whenever applicable. 1454 * 1455 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). 1456 * Use already declared XXH3_createState() and XXH3_freeState(). 1457 * 1458 * All reset and streaming functions have same meaning as their 64-bit counterpart. 1459 */ 1460 1461 /*! 1462 * @brief Resets an @ref XXH3_state_t to begin a new hash. 1463 * 1464 * @param statePtr The state struct to reset. 1465 * 1466 * @pre 1467 * @p statePtr must not be `NULL`. 1468 * 1469 * @return @ref XXH_OK on success. 1470 * @return @ref XXH_ERROR on failure. 1471 * 1472 * @note 1473 * - This function resets `statePtr` and generate a secret with default parameters. 1474 * - Call it before @ref XXH3_128bits_update(). 1475 * - Digest will be equivalent to `XXH3_128bits()`. 1476 * 1477 * @see @ref streaming_example "Streaming Example" 1478 */ 1479 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); 1480 1481 /*! 1482 * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. 1483 * 1484 * @param statePtr The state struct to reset. 1485 * @param seed The 64-bit seed to alter the hash result predictably. 1486 * 1487 * @pre 1488 * @p statePtr must not be `NULL`. 1489 * 1490 * @return @ref XXH_OK on success. 1491 * @return @ref XXH_ERROR on failure. 1492 * 1493 * @note 1494 * - This function resets `statePtr` and generate a secret from `seed`. 1495 * - Call it before @ref XXH3_128bits_update(). 1496 * - Digest will be equivalent to `XXH3_128bits_withSeed()`. 1497 * 1498 * @see @ref streaming_example "Streaming Example" 1499 */ 1500 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); 1501 /*! 1502 * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. 1503 * 1504 * @param statePtr The state struct to reset. 1505 * @param secret The secret data. 1506 * @param secretSize The length of @p secret, in bytes. 1507 * 1508 * @pre 1509 * @p statePtr must not be `NULL`. 1510 * 1511 * @return @ref XXH_OK on success. 1512 * @return @ref XXH_ERROR on failure. 1513 * 1514 * `secret` is referenced, it _must outlive_ the hash streaming session. 1515 * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, 1516 * and the quality of produced hash values depends on secret's entropy 1517 * (secret's content should look like a bunch of random bytes). 1518 * When in doubt about the randomness of a candidate `secret`, 1519 * consider employing `XXH3_generateSecret()` instead (see below). 1520 * 1521 * @see @ref streaming_example "Streaming Example" 1522 */ 1523 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); 1524 1525 /*! 1526 * @brief Consumes a block of @p input to an @ref XXH3_state_t. 1527 * 1528 * Call this to incrementally consume blocks of data. 1529 * 1530 * @param statePtr The state struct to update. 1531 * @param input The block of data to be hashed, at least @p length bytes in size. 1532 * @param length The length of @p input, in bytes. 1533 * 1534 * @pre 1535 * @p statePtr must not be `NULL`. 1536 * 1537 * @return @ref XXH_OK on success. 1538 * @return @ref XXH_ERROR on failure. 1539 * 1540 * @note 1541 * The memory between @p input and @p input + @p length must be valid, 1542 * readable, contiguous memory. However, if @p length is `0`, @p input may be 1543 * `NULL`. In C++, this also must be *TriviallyCopyable*. 1544 * 1545 */ 1546 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); 1547 1548 /*! 1549 * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. 1550 * 1551 * @param statePtr The state struct to calculate the hash from. 1552 * 1553 * @pre 1554 * @p statePtr must not be `NULL`. 1555 * 1556 * @return The calculated XXH3 128-bit hash value from that state. 1557 * 1558 * @note 1559 * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, 1560 * digest, and update again. 1561 * 1562 */ 1563 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); 1564 #endif /* !XXH_NO_STREAM */ 1565 1566 /* Following helper functions make it possible to compare XXH128_hast_t values. 1567 * Since XXH128_hash_t is a structure, this capability is not offered by the language. 1568 * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ 1569 1570 /*! 1571 * @brief Check equality of two XXH128_hash_t values 1572 * 1573 * @param h1 The 128-bit hash value. 1574 * @param h2 Another 128-bit hash value. 1575 * 1576 * @return `1` if `h1` and `h2` are equal. 1577 * @return `0` if they are not. 1578 */ 1579 XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); 1580 1581 /*! 1582 * @brief Compares two @ref XXH128_hash_t 1583 * 1584 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 1585 * 1586 * @param h128_1 Left-hand side value 1587 * @param h128_2 Right-hand side value 1588 * 1589 * @return >0 if @p h128_1 > @p h128_2 1590 * @return =0 if @p h128_1 == @p h128_2 1591 * @return <0 if @p h128_1 < @p h128_2 1592 */ 1593 XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); 1594 1595 1596 /******* Canonical representation *******/ 1597 typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; 1598 1599 1600 /*! 1601 * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. 1602 * 1603 * @param dst The @ref XXH128_canonical_t pointer to be stored to. 1604 * @param hash The @ref XXH128_hash_t to be converted. 1605 * 1606 * @pre 1607 * @p dst must not be `NULL`. 1608 * @see @ref canonical_representation_example "Canonical Representation Example" 1609 */ 1610 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); 1611 1612 /*! 1613 * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. 1614 * 1615 * @param src The @ref XXH128_canonical_t to convert. 1616 * 1617 * @pre 1618 * @p src must not be `NULL`. 1619 * 1620 * @return The converted hash. 1621 * @see @ref canonical_representation_example "Canonical Representation Example" 1622 */ 1623 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); 1624 1625 1626 #endif /* !XXH_NO_XXH3 */ 1627 #endif /* XXH_NO_LONG_LONG */ 1628 1629 /*! 1630 * @} 1631 */ 1632 #endif /* XXHASH_H_5627135585666179 */ 1633 1634 1635 1636 #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) 1637 #define XXHASH_H_STATIC_13879238742 1638 /* **************************************************************************** 1639 * This section contains declarations which are not guaranteed to remain stable. 1640 * They may change in future versions, becoming incompatible with a different 1641 * version of the library. 1642 * These declarations should only be used with static linking. 1643 * Never use them in association with dynamic linking! 1644 ***************************************************************************** */ 1645 1646 /* 1647 * These definitions are only present to allow static allocation 1648 * of XXH states, on stack or in a struct, for example. 1649 * Never **ever** access their members directly. 1650 */ 1651 1652 /*! 1653 * @internal 1654 * @brief Structure for XXH32 streaming API. 1655 * 1656 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, 1657 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is 1658 * an opaque type. This allows fields to safely be changed. 1659 * 1660 * Typedef'd to @ref XXH32_state_t. 1661 * Do not access the members of this struct directly. 1662 * @see XXH64_state_s, XXH3_state_s 1663 */ 1664 struct XXH32_state_s { 1665 XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ 1666 XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ 1667 XXH32_hash_t acc[4]; /*!< Accumulator lanes */ 1668 unsigned char buffer[16]; /*!< Internal buffer for partial reads. */ 1669 XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */ 1670 XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ 1671 }; /* typedef'd to XXH32_state_t */ 1672 1673 1674 #ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ 1675 1676 /*! 1677 * @internal 1678 * @brief Structure for XXH64 streaming API. 1679 * 1680 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, 1681 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is 1682 * an opaque type. This allows fields to safely be changed. 1683 * 1684 * Typedef'd to @ref XXH64_state_t. 1685 * Do not access the members of this struct directly. 1686 * @see XXH32_state_s, XXH3_state_s 1687 */ 1688 struct XXH64_state_s { 1689 XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ 1690 XXH64_hash_t acc[4]; /*!< Accumulator lanes */ 1691 unsigned char buffer[32]; /*!< Internal buffer for partial reads.. */ 1692 XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */ 1693 XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ 1694 XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */ 1695 }; /* typedef'd to XXH64_state_t */ 1696 1697 #ifndef XXH_NO_XXH3 1698 1699 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ 1700 # define XXH_ALIGN(n) _Alignas(n) 1701 #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ 1702 /* In C++ alignas() is a keyword */ 1703 # define XXH_ALIGN(n) alignas(n) 1704 #elif defined(__GNUC__) 1705 # define XXH_ALIGN(n) __attribute__ ((aligned(n))) 1706 #elif defined(_MSC_VER) 1707 # define XXH_ALIGN(n) __declspec(align(n)) 1708 #else 1709 # define XXH_ALIGN(n) /* disabled */ 1710 #endif 1711 1712 /* Old GCC versions only accept the attribute after the type in structures. */ 1713 #if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ 1714 && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ 1715 && defined(__GNUC__) 1716 # define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) 1717 #else 1718 # define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type 1719 #endif 1720 1721 /*! 1722 * @internal 1723 * @brief The size of the internal XXH3 buffer. 1724 * 1725 * This is the optimal update size for incremental hashing. 1726 * 1727 * @see XXH3_64b_update(), XXH3_128b_update(). 1728 */ 1729 #define XXH3_INTERNALBUFFER_SIZE 256 1730 1731 /*! 1732 * @def XXH3_SECRET_DEFAULT_SIZE 1733 * @brief Default Secret's size 1734 * 1735 * This is the size of internal XXH3_kSecret 1736 * and is needed by XXH3_generateSecret_fromSeed(). 1737 * 1738 * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. 1739 */ 1740 #define XXH3_SECRET_DEFAULT_SIZE 192 1741 1742 /*! 1743 * @internal 1744 * @brief Structure for XXH3 streaming API. 1745 * 1746 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, 1747 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. 1748 * Otherwise it is an opaque type. 1749 * Never use this definition in combination with dynamic library. 1750 * This allows fields to safely be changed in the future. 1751 * 1752 * @note ** This structure has a strict alignment requirement of 64 bytes!! ** 1753 * Do not allocate this with `malloc()` or `new`, 1754 * it will not be sufficiently aligned. 1755 * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. 1756 * 1757 * Typedef'd to @ref XXH3_state_t. 1758 * Do never access the members of this struct directly. 1759 * 1760 * @see XXH3_INITSTATE() for stack initialization. 1761 * @see XXH3_createState(), XXH3_freeState(). 1762 * @see XXH32_state_s, XXH64_state_s 1763 */ 1764 struct XXH3_state_s { 1765 XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); 1766 /*!< The 8 accumulators. See @ref XXH32_state_s::acc and @ref XXH64_state_s::acc */ 1767 XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); 1768 /*!< Used to store a custom secret generated from a seed. */ 1769 XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); 1770 /*!< The internal buffer. @see XXH32_state_s::mem32 */ 1771 XXH32_hash_t bufferedSize; 1772 /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ 1773 XXH32_hash_t useSeed; 1774 /*!< Reserved field. Needed for padding on 64-bit. */ 1775 size_t nbStripesSoFar; 1776 /*!< Number or stripes processed. */ 1777 XXH64_hash_t totalLen; 1778 /*!< Total length hashed. 64-bit even on 32-bit targets. */ 1779 size_t nbStripesPerBlock; 1780 /*!< Number of stripes per block. */ 1781 size_t secretLimit; 1782 /*!< Size of @ref customSecret or @ref extSecret */ 1783 XXH64_hash_t seed; 1784 /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ 1785 XXH64_hash_t reserved64; 1786 /*!< Reserved field. */ 1787 const unsigned char* extSecret; 1788 /*!< Reference to an external secret for the _withSecret variants, NULL 1789 * for other variants. */ 1790 /* note: there may be some padding at the end due to alignment on 64 bytes */ 1791 }; /* typedef'd to XXH3_state_t */ 1792 1793 #undef XXH_ALIGN_MEMBER 1794 1795 /*! 1796 * @brief Initializes a stack-allocated `XXH3_state_s`. 1797 * 1798 * When the @ref XXH3_state_t structure is merely emplaced on stack, 1799 * it should be initialized with XXH3_INITSTATE() or a memset() 1800 * in case its first reset uses XXH3_NNbits_reset_withSeed(). 1801 * This init can be omitted if the first reset uses default or _withSecret mode. 1802 * This operation isn't necessary when the state is created with XXH3_createState(). 1803 * Note that this doesn't prepare the state for a streaming operation, 1804 * it's still necessary to use XXH3_NNbits_reset*() afterwards. 1805 */ 1806 #define XXH3_INITSTATE(XXH3_state_ptr) \ 1807 do { \ 1808 XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ 1809 tmp_xxh3_state_ptr->seed = 0; \ 1810 tmp_xxh3_state_ptr->extSecret = NULL; \ 1811 } while(0) 1812 1813 1814 /*! 1815 * @brief Calculates the 128-bit hash of @p data using XXH3. 1816 * 1817 * @param data The block of data to be hashed, at least @p len bytes in size. 1818 * @param len The length of @p data, in bytes. 1819 * @param seed The 64-bit seed to alter the hash's output predictably. 1820 * 1821 * @pre 1822 * The memory between @p data and @p data + @p len must be valid, 1823 * readable, contiguous memory. However, if @p len is `0`, @p data may be 1824 * `NULL`. In C++, this also must be *TriviallyCopyable*. 1825 * 1826 * @return The calculated 128-bit XXH3 value. 1827 * 1828 * @see @ref single_shot_example "Single Shot Example" for an example. 1829 */ 1830 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); 1831 1832 1833 /* === Experimental API === */ 1834 /* Symbols defined below must be considered tied to a specific library version. */ 1835 1836 /*! 1837 * @brief Derive a high-entropy secret from any user-defined content, named customSeed. 1838 * 1839 * @param secretBuffer A writable buffer for derived high-entropy secret data. 1840 * @param secretSize Size of secretBuffer, in bytes. Must be >= XXH3_SECRET_SIZE_MIN. 1841 * @param customSeed A user-defined content. 1842 * @param customSeedSize Size of customSeed, in bytes. 1843 * 1844 * @return @ref XXH_OK on success. 1845 * @return @ref XXH_ERROR on failure. 1846 * 1847 * The generated secret can be used in combination with `*_withSecret()` functions. 1848 * The `_withSecret()` variants are useful to provide a higher level of protection 1849 * than 64-bit seed, as it becomes much more difficult for an external actor to 1850 * guess how to impact the calculation logic. 1851 * 1852 * The function accepts as input a custom seed of any length and any content, 1853 * and derives from it a high-entropy secret of length @p secretSize into an 1854 * already allocated buffer @p secretBuffer. 1855 * 1856 * The generated secret can then be used with any `*_withSecret()` variant. 1857 * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), 1858 * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() 1859 * are part of this list. They all accept a `secret` parameter 1860 * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) 1861 * _and_ feature very high entropy (consist of random-looking bytes). 1862 * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can 1863 * be employed to ensure proper quality. 1864 * 1865 * @p customSeed can be anything. It can have any size, even small ones, 1866 * and its content can be anything, even "poor entropy" sources such as a bunch 1867 * of zeroes. The resulting `secret` will nonetheless provide all required qualities. 1868 * 1869 * @pre 1870 * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN 1871 * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. 1872 * 1873 * Example code: 1874 * @code{.c} 1875 * #include <stdio.h> 1876 * #include <stdlib.h> 1877 * #include <string.h> 1878 * #define XXH_STATIC_LINKING_ONLY // expose unstable API 1879 * #include "xxhash.h" 1880 * // Hashes argv[2] using the entropy from argv[1]. 1881 * int main(int argc, char* argv[]) 1882 * { 1883 * char secret[XXH3_SECRET_SIZE_MIN]; 1884 * if (argv != 3) { return 1; } 1885 * XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1])); 1886 * XXH64_hash_t h = XXH3_64bits_withSecret( 1887 * argv[2], strlen(argv[2]), 1888 * secret, sizeof(secret) 1889 * ); 1890 * printf("%016llx\n", (unsigned long long) h); 1891 * } 1892 * @endcode 1893 */ 1894 XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize); 1895 1896 /*! 1897 * @brief Generate the same secret as the _withSeed() variants. 1898 * 1899 * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes 1900 * @param seed The 64-bit seed to alter the hash result predictably. 1901 * 1902 * The generated secret can be used in combination with 1903 *`*_withSecret()` and `_withSecretandSeed()` variants. 1904 * 1905 * Example C++ `std::string` hash class: 1906 * @code{.cpp} 1907 * #include <string> 1908 * #define XXH_STATIC_LINKING_ONLY // expose unstable API 1909 * #include "xxhash.h" 1910 * // Slow, seeds each time 1911 * class HashSlow { 1912 * XXH64_hash_t seed; 1913 * public: 1914 * HashSlow(XXH64_hash_t s) : seed{s} {} 1915 * size_t operator()(const std::string& x) const { 1916 * return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)}; 1917 * } 1918 * }; 1919 * // Fast, caches the seeded secret for future uses. 1920 * class HashFast { 1921 * unsigned char secret[XXH3_SECRET_DEFAULT_SIZE]; 1922 * public: 1923 * HashFast(XXH64_hash_t s) { 1924 * XXH3_generateSecret_fromSeed(secret, seed); 1925 * } 1926 * size_t operator()(const std::string& x) const { 1927 * return size_t{ 1928 * XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret)) 1929 * }; 1930 * } 1931 * }; 1932 * @endcode 1933 */ 1934 XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed); 1935 1936 /*! 1937 * @brief Maximum size of "short" key in bytes. 1938 */ 1939 #define XXH3_MIDSIZE_MAX 240 1940 1941 /*! 1942 * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data. 1943 * 1944 * @param data The block of data to be hashed, at least @p len bytes in size. 1945 * @param len The length of @p data, in bytes. 1946 * @param secret The secret data. 1947 * @param secretSize The length of @p secret, in bytes. 1948 * @param seed The 64-bit seed to alter the hash result predictably. 1949 * 1950 * These variants generate hash values using either: 1951 * - @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes) 1952 * - @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX). 1953 * 1954 * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. 1955 * `_withSeed()` has to generate the secret on the fly for "large" keys. 1956 * It's fast, but can be perceptible for "not so large" keys (< 1 KB). 1957 * `_withSecret()` has to generate the masks on the fly for "small" keys, 1958 * which requires more instructions than _withSeed() variants. 1959 * Therefore, _withSecretandSeed variant combines the best of both worlds. 1960 * 1961 * When @p secret has been generated by XXH3_generateSecret_fromSeed(), 1962 * this variant produces *exactly* the same results as `_withSeed()` variant, 1963 * hence offering only a pure speed benefit on "large" input, 1964 * by skipping the need to regenerate the secret for every large input. 1965 * 1966 * Another usage scenario is to hash the secret to a 64-bit hash value, 1967 * for example with XXH3_64bits(), which then becomes the seed, 1968 * and then employ both the seed and the secret in _withSecretandSeed(). 1969 * On top of speed, an added benefit is that each bit in the secret 1970 * has a 50% chance to swap each bit in the output, via its impact to the seed. 1971 * 1972 * This is not guaranteed when using the secret directly in "small data" scenarios, 1973 * because only portions of the secret are employed for small data. 1974 */ 1975 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t 1976 XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len, 1977 XXH_NOESCAPE const void* secret, size_t secretSize, 1978 XXH64_hash_t seed); 1979 1980 /*! 1981 * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. 1982 * 1983 * @param input The memory segment to be hashed, at least @p len bytes in size. 1984 * @param length The length of @p data, in bytes. 1985 * @param secret The secret used to alter hash result predictably. 1986 * @param secretSize The length of @p secret, in bytes (must be >= XXH3_SECRET_SIZE_MIN) 1987 * @param seed64 The 64-bit seed to alter the hash result predictably. 1988 * 1989 * @return @ref XXH_OK on success. 1990 * @return @ref XXH_ERROR on failure. 1991 * 1992 * @see XXH3_64bits_withSecretandSeed(): contract is the same. 1993 */ 1994 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t 1995 XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, 1996 XXH_NOESCAPE const void* secret, size_t secretSize, 1997 XXH64_hash_t seed64); 1998 1999 #ifndef XXH_NO_STREAM 2000 /*! 2001 * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. 2002 * 2003 * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). 2004 * @param secret The secret data. 2005 * @param secretSize The length of @p secret, in bytes. 2006 * @param seed64 The 64-bit seed to alter the hash result predictably. 2007 * 2008 * @return @ref XXH_OK on success. 2009 * @return @ref XXH_ERROR on failure. 2010 * 2011 * @see XXH3_64bits_withSecretandSeed(). Contract is identical. 2012 */ 2013 XXH_PUBLIC_API XXH_errorcode 2014 XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, 2015 XXH_NOESCAPE const void* secret, size_t secretSize, 2016 XXH64_hash_t seed64); 2017 2018 /*! 2019 * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. 2020 * 2021 * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). 2022 * @param secret The secret data. 2023 * @param secretSize The length of @p secret, in bytes. 2024 * @param seed64 The 64-bit seed to alter the hash result predictably. 2025 * 2026 * @return @ref XXH_OK on success. 2027 * @return @ref XXH_ERROR on failure. 2028 * 2029 * @see XXH3_64bits_withSecretandSeed(). Contract is identical. 2030 * 2031 * Note: there was a bug in an earlier version of this function (<= v0.8.2) 2032 * that would make it generate an incorrect hash value 2033 * when @p seed == 0 and @p length < XXH3_MIDSIZE_MAX 2034 * and @p secret is different from XXH3_generateSecret_fromSeed(). 2035 * As stated in the contract, the correct hash result must be 2036 * the same as XXH3_128bits_withSeed() when @p length <= XXH3_MIDSIZE_MAX. 2037 * Results generated by this older version are wrong, hence not comparable. 2038 */ 2039 XXH_PUBLIC_API XXH_errorcode 2040 XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, 2041 XXH_NOESCAPE const void* secret, size_t secretSize, 2042 XXH64_hash_t seed64); 2043 2044 #endif /* !XXH_NO_STREAM */ 2045 2046 #endif /* !XXH_NO_XXH3 */ 2047 #endif /* XXH_NO_LONG_LONG */ 2048 #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) 2049 # define XXH_IMPLEMENTATION 2050 #endif 2051 2052 #endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ 2053 2054 2055 /* ======================================================================== */ 2056 /* ======================================================================== */ 2057 /* ======================================================================== */ 2058 2059 2060 /*-********************************************************************** 2061 * xxHash implementation 2062 *-********************************************************************** 2063 * xxHash's implementation used to be hosted inside xxhash.c. 2064 * 2065 * However, inlining requires implementation to be visible to the compiler, 2066 * hence be included alongside the header. 2067 * Previously, implementation was hosted inside xxhash.c, 2068 * which was then #included when inlining was activated. 2069 * This construction created issues with a few build and install systems, 2070 * as it required xxhash.c to be stored in /include directory. 2071 * 2072 * xxHash implementation is now directly integrated within xxhash.h. 2073 * As a consequence, xxhash.c is no longer needed in /include. 2074 * 2075 * xxhash.c is still available and is still useful. 2076 * In a "normal" setup, when xxhash is not inlined, 2077 * xxhash.h only exposes the prototypes and public symbols, 2078 * while xxhash.c can be built into an object file xxhash.o 2079 * which can then be linked into the final binary. 2080 ************************************************************************/ 2081 2082 #if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ 2083 || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) 2084 # define XXH_IMPLEM_13a8737387 2085 2086 /* ************************************* 2087 * Tuning parameters 2088 ***************************************/ 2089 2090 /*! 2091 * @defgroup tuning Tuning parameters 2092 * @{ 2093 * 2094 * Various macros to control xxHash's behavior. 2095 */ 2096 #ifdef XXH_DOXYGEN 2097 /*! 2098 * @brief Define this to disable 64-bit code. 2099 * 2100 * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. 2101 */ 2102 # define XXH_NO_LONG_LONG 2103 # undef XXH_NO_LONG_LONG /* don't actually */ 2104 /*! 2105 * @brief Controls how unaligned memory is accessed. 2106 * 2107 * By default, access to unaligned memory is controlled by `memcpy()`, which is 2108 * safe and portable. 2109 * 2110 * Unfortunately, on some target/compiler combinations, the generated assembly 2111 * is sub-optimal. 2112 * 2113 * The below switch allow selection of a different access method 2114 * in the search for improved performance. 2115 * 2116 * @par Possible options: 2117 * 2118 * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` 2119 * @par 2120 * Use `memcpy()`. Safe and portable. Note that most modern compilers will 2121 * eliminate the function call and treat it as an unaligned access. 2122 * 2123 * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` 2124 * @par 2125 * Depends on compiler extensions and is therefore not portable. 2126 * This method is safe _if_ your compiler supports it, 2127 * and *generally* as fast or faster than `memcpy`. 2128 * 2129 * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast 2130 * @par 2131 * Casts directly and dereferences. This method doesn't depend on the 2132 * compiler, but it violates the C standard as it directly dereferences an 2133 * unaligned pointer. It can generate buggy code on targets which do not 2134 * support unaligned memory accesses, but in some circumstances, it's the 2135 * only known way to get the most performance. 2136 * 2137 * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift 2138 * @par 2139 * Also portable. This can generate the best code on old compilers which don't 2140 * inline small `memcpy()` calls, and it might also be faster on big-endian 2141 * systems which lack a native byteswap instruction. However, some compilers 2142 * will emit literal byteshifts even if the target supports unaligned access. 2143 * 2144 * 2145 * @warning 2146 * Methods 1 and 2 rely on implementation-defined behavior. Use these with 2147 * care, as what works on one compiler/platform/optimization level may cause 2148 * another to read garbage data or even crash. 2149 * 2150 * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. 2151 * 2152 * Prefer these methods in priority order (0 > 3 > 1 > 2) 2153 */ 2154 # define XXH_FORCE_MEMORY_ACCESS 0 2155 2156 /*! 2157 * @def XXH_SIZE_OPT 2158 * @brief Controls how much xxHash optimizes for size. 2159 * 2160 * xxHash, when compiled, tends to result in a rather large binary size. This 2161 * is mostly due to heavy usage to forced inlining and constant folding of the 2162 * @ref XXH3_family to increase performance. 2163 * 2164 * However, some developers prefer size over speed. This option can 2165 * significantly reduce the size of the generated code. When using the `-Os` 2166 * or `-Oz` options on GCC or Clang, this is defined to 1 by default, 2167 * otherwise it is defined to 0. 2168 * 2169 * Most of these size optimizations can be controlled manually. 2170 * 2171 * This is a number from 0-2. 2172 * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed 2173 * comes first. 2174 * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more 2175 * conservative and disables hacks that increase code size. It implies the 2176 * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, 2177 * and @ref XXH3_NEON_LANES == 8 if they are not already defined. 2178 * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. 2179 * Performance may cry. For example, the single shot functions just use the 2180 * streaming API. 2181 */ 2182 # define XXH_SIZE_OPT 0 2183 2184 /*! 2185 * @def XXH_FORCE_ALIGN_CHECK 2186 * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() 2187 * and XXH64() only). 2188 * 2189 * This is an important performance trick for architectures without decent 2190 * unaligned memory access performance. 2191 * 2192 * It checks for input alignment, and when conditions are met, uses a "fast 2193 * path" employing direct 32-bit/64-bit reads, resulting in _dramatically 2194 * faster_ read speed. 2195 * 2196 * The check costs one initial branch per hash, which is generally negligible, 2197 * but not zero. 2198 * 2199 * Moreover, it's not useful to generate an additional code path if memory 2200 * access uses the same instruction for both aligned and unaligned 2201 * addresses (e.g. x86 and aarch64). 2202 * 2203 * In these cases, the alignment check can be removed by setting this macro to 0. 2204 * Then the code will always use unaligned memory access. 2205 * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips 2206 * which are platforms known to offer good unaligned memory accesses performance. 2207 * 2208 * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. 2209 * 2210 * This option does not affect XXH3 (only XXH32 and XXH64). 2211 */ 2212 # define XXH_FORCE_ALIGN_CHECK 0 2213 2214 /*! 2215 * @def XXH_NO_INLINE_HINTS 2216 * @brief When non-zero, sets all functions to `static`. 2217 * 2218 * By default, xxHash tries to force the compiler to inline almost all internal 2219 * functions. 2220 * 2221 * This can usually improve performance due to reduced jumping and improved 2222 * constant folding, but significantly increases the size of the binary which 2223 * might not be favorable. 2224 * 2225 * Additionally, sometimes the forced inlining can be detrimental to performance, 2226 * depending on the architecture. 2227 * 2228 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the 2229 * compiler full control on whether to inline or not. 2230 * 2231 * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if 2232 * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. 2233 */ 2234 # define XXH_NO_INLINE_HINTS 0 2235 2236 /*! 2237 * @def XXH3_INLINE_SECRET 2238 * @brief Determines whether to inline the XXH3 withSecret code. 2239 * 2240 * When the secret size is known, the compiler can improve the performance 2241 * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). 2242 * 2243 * However, if the secret size is not known, it doesn't have any benefit. This 2244 * happens when xxHash is compiled into a global symbol. Therefore, if 2245 * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. 2246 * 2247 * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers 2248 * that are *sometimes* force inline on -Og, and it is impossible to automatically 2249 * detect this optimization level. 2250 */ 2251 # define XXH3_INLINE_SECRET 0 2252 2253 /*! 2254 * @def XXH32_ENDJMP 2255 * @brief Whether to use a jump for `XXH32_finalize`. 2256 * 2257 * For performance, `XXH32_finalize` uses multiple branches in the finalizer. 2258 * This is generally preferable for performance, 2259 * but depending on exact architecture, a jmp may be preferable. 2260 * 2261 * This setting is only possibly making a difference for very small inputs. 2262 */ 2263 # define XXH32_ENDJMP 0 2264 2265 /*! 2266 * @internal 2267 * @brief Redefines old internal names. 2268 * 2269 * For compatibility with code that uses xxHash's internals before the names 2270 * were changed to improve namespacing. There is no other reason to use this. 2271 */ 2272 # define XXH_OLD_NAMES 2273 # undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ 2274 2275 /*! 2276 * @def XXH_NO_STREAM 2277 * @brief Disables the streaming API. 2278 * 2279 * When xxHash is not inlined and the streaming functions are not used, disabling 2280 * the streaming functions can improve code size significantly, especially with 2281 * the @ref XXH3_family which tends to make constant folded copies of itself. 2282 */ 2283 # define XXH_NO_STREAM 2284 # undef XXH_NO_STREAM /* don't actually */ 2285 #endif /* XXH_DOXYGEN */ 2286 /*! 2287 * @} 2288 */ 2289 2290 #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ 2291 /* prefer __packed__ structures (method 1) for GCC 2292 * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy 2293 * which for some reason does unaligned loads. */ 2294 # if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) 2295 # define XXH_FORCE_MEMORY_ACCESS 1 2296 # endif 2297 #endif 2298 2299 #ifndef XXH_SIZE_OPT 2300 /* default to 1 for -Os or -Oz */ 2301 # if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) 2302 # define XXH_SIZE_OPT 1 2303 # else 2304 # define XXH_SIZE_OPT 0 2305 # endif 2306 #endif 2307 2308 #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ 2309 /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ 2310 # if XXH_SIZE_OPT >= 1 || \ 2311 defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ 2312 || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ 2313 # define XXH_FORCE_ALIGN_CHECK 0 2314 # else 2315 # define XXH_FORCE_ALIGN_CHECK 1 2316 # endif 2317 #endif 2318 2319 #ifndef XXH_NO_INLINE_HINTS 2320 # if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ 2321 # define XXH_NO_INLINE_HINTS 1 2322 # else 2323 # define XXH_NO_INLINE_HINTS 0 2324 # endif 2325 #endif 2326 2327 #ifndef XXH3_INLINE_SECRET 2328 # if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \ 2329 || !defined(XXH_INLINE_ALL) 2330 # define XXH3_INLINE_SECRET 0 2331 # else 2332 # define XXH3_INLINE_SECRET 1 2333 # endif 2334 #endif 2335 2336 #ifndef XXH32_ENDJMP 2337 /* generally preferable for performance */ 2338 # define XXH32_ENDJMP 0 2339 #endif 2340 2341 /*! 2342 * @defgroup impl Implementation 2343 * @{ 2344 */ 2345 2346 2347 /* ************************************* 2348 * Includes & Memory related functions 2349 ***************************************/ 2350 #if defined(XXH_NO_STREAM) 2351 /* nothing */ 2352 #elif defined(XXH_NO_STDLIB) 2353 2354 /* When requesting to disable any mention of stdlib, 2355 * the library loses the ability to invoked malloc / free. 2356 * In practice, it means that functions like `XXH*_createState()` 2357 * will always fail, and return NULL. 2358 * This flag is useful in situations where 2359 * xxhash.h is integrated into some kernel, embedded or limited environment 2360 * without access to dynamic allocation. 2361 */ 2362 2363 static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } 2364 static void XXH_free(void* p) { (void)p; } 2365 2366 #else 2367 2368 /* 2369 * Modify the local functions below should you wish to use 2370 * different memory routines for malloc() and free() 2371 */ 2372 #include <stdlib.h> 2373 2374 /*! 2375 * @internal 2376 * @brief Modify this function to use a different routine than malloc(). 2377 */ 2378 static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } 2379 2380 /*! 2381 * @internal 2382 * @brief Modify this function to use a different routine than free(). 2383 */ 2384 static void XXH_free(void* p) { free(p); } 2385 2386 #endif /* XXH_NO_STDLIB */ 2387 2388 #ifndef XXH_memcpy 2389 /*! 2390 * @internal 2391 * @brief XXH_memcpy() macro can be redirected at compile time 2392 */ 2393 # include <string.h> 2394 # define XXH_memcpy memcpy 2395 #endif 2396 2397 #ifndef XXH_memset 2398 /*! 2399 * @internal 2400 * @brief XXH_memset() macro can be redirected at compile time 2401 */ 2402 # include <string.h> 2403 # define XXH_memset memset 2404 #endif 2405 2406 #ifndef XXH_memcmp 2407 /*! 2408 * @internal 2409 * @brief XXH_memcmp() macro can be redirected at compile time 2410 * Note: only needed by XXH128. 2411 */ 2412 # include <string.h> 2413 # define XXH_memcmp memcmp 2414 #endif 2415 2416 2417 2418 #include <limits.h> /* ULLONG_MAX */ 2419 2420 2421 /* ************************************* 2422 * Compiler Specific Options 2423 ***************************************/ 2424 #ifdef _MSC_VER /* Visual Studio warning fix */ 2425 # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ 2426 #endif 2427 2428 #if XXH_NO_INLINE_HINTS /* disable inlining hints */ 2429 # if defined(__GNUC__) || defined(__clang__) 2430 # define XXH_FORCE_INLINE static __attribute__((__unused__)) 2431 # else 2432 # define XXH_FORCE_INLINE static 2433 # endif 2434 # define XXH_NO_INLINE static 2435 /* enable inlining hints */ 2436 #elif defined(__GNUC__) || defined(__clang__) 2437 # define XXH_FORCE_INLINE static __inline__ __attribute__((__always_inline__, __unused__)) 2438 # define XXH_NO_INLINE static __attribute__((__noinline__)) 2439 #elif defined(_MSC_VER) /* Visual Studio */ 2440 # define XXH_FORCE_INLINE static __forceinline 2441 # define XXH_NO_INLINE static __declspec(noinline) 2442 #elif defined (__cplusplus) \ 2443 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ 2444 # define XXH_FORCE_INLINE static inline 2445 # define XXH_NO_INLINE static 2446 #else 2447 # define XXH_FORCE_INLINE static 2448 # define XXH_NO_INLINE static 2449 #endif 2450 2451 #if defined(XXH_INLINE_ALL) 2452 # define XXH_STATIC XXH_FORCE_INLINE 2453 #else 2454 # define XXH_STATIC static 2455 #endif 2456 2457 #if XXH3_INLINE_SECRET 2458 # define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE 2459 #else 2460 # define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE 2461 #endif 2462 2463 #if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ 2464 # define XXH_RESTRICT /* disable */ 2465 #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ 2466 # define XXH_RESTRICT restrict 2467 #elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ 2468 || (defined (__clang__)) \ 2469 || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ 2470 || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) 2471 /* 2472 * There are a LOT more compilers that recognize __restrict but this 2473 * covers the major ones. 2474 */ 2475 # define XXH_RESTRICT __restrict 2476 #else 2477 # define XXH_RESTRICT /* disable */ 2478 #endif 2479 2480 /* ************************************* 2481 * Debug 2482 ***************************************/ 2483 /*! 2484 * @ingroup tuning 2485 * @def XXH_DEBUGLEVEL 2486 * @brief Sets the debugging level. 2487 * 2488 * XXH_DEBUGLEVEL is expected to be defined externally, typically via the 2489 * compiler's command line options. The value must be a number. 2490 */ 2491 #ifndef XXH_DEBUGLEVEL 2492 # ifdef DEBUGLEVEL /* backwards compat */ 2493 # define XXH_DEBUGLEVEL DEBUGLEVEL 2494 # else 2495 # define XXH_DEBUGLEVEL 0 2496 # endif 2497 #endif 2498 2499 #if (XXH_DEBUGLEVEL>=1) 2500 # include <assert.h> /* note: can still be disabled with NDEBUG */ 2501 # define XXH_ASSERT(c) assert(c) 2502 #else 2503 # if defined(__INTEL_COMPILER) 2504 # define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c)) 2505 # else 2506 # define XXH_ASSERT(c) XXH_ASSUME(c) 2507 # endif 2508 #endif 2509 2510 /* note: use after variable declarations */ 2511 #ifndef XXH_STATIC_ASSERT 2512 # if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ 2513 # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) 2514 # elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ 2515 # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) 2516 # else 2517 # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) 2518 # endif 2519 # define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) 2520 #endif 2521 2522 /*! 2523 * @internal 2524 * @def XXH_COMPILER_GUARD(var) 2525 * @brief Used to prevent unwanted optimizations for @p var. 2526 * 2527 * It uses an empty GCC inline assembly statement with a register constraint 2528 * which forces @p var into a general purpose register (eg eax, ebx, ecx 2529 * on x86) and marks it as modified. 2530 * 2531 * This is used in a few places to avoid unwanted autovectorization (e.g. 2532 * XXH32_round()). All vectorization we want is explicit via intrinsics, 2533 * and _usually_ isn't wanted elsewhere. 2534 * 2535 * We also use it to prevent unwanted constant folding for AArch64 in 2536 * XXH3_initCustomSecret_scalar(). 2537 */ 2538 #if defined(__GNUC__) || defined(__clang__) 2539 # define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var)) 2540 #else 2541 # define XXH_COMPILER_GUARD(var) ((void)0) 2542 #endif 2543 2544 /* Specifically for NEON vectors which use the "w" constraint, on 2545 * Clang. */ 2546 #if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__) 2547 # define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var)) 2548 #else 2549 # define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0) 2550 #endif 2551 2552 /* ************************************* 2553 * Basic Types 2554 ***************************************/ 2555 #if !defined (__VMS) \ 2556 && (defined (__cplusplus) \ 2557 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) 2558 # ifdef _AIX 2559 # include <inttypes.h> 2560 # else 2561 # include <stdint.h> 2562 # endif 2563 typedef uint8_t xxh_u8; 2564 #else 2565 typedef unsigned char xxh_u8; 2566 #endif 2567 typedef XXH32_hash_t xxh_u32; 2568 2569 #ifdef XXH_OLD_NAMES 2570 # warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly" 2571 # define BYTE xxh_u8 2572 # define U8 xxh_u8 2573 # define U32 xxh_u32 2574 #endif 2575 2576 /* *** Memory access *** */ 2577 2578 /*! 2579 * @internal 2580 * @fn xxh_u32 XXH_read32(const void* ptr) 2581 * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. 2582 * 2583 * Affected by @ref XXH_FORCE_MEMORY_ACCESS. 2584 * 2585 * @param ptr The pointer to read from. 2586 * @return The 32-bit native endian integer from the bytes at @p ptr. 2587 */ 2588 2589 /*! 2590 * @internal 2591 * @fn xxh_u32 XXH_readLE32(const void* ptr) 2592 * @brief Reads an unaligned 32-bit little endian integer from @p ptr. 2593 * 2594 * Affected by @ref XXH_FORCE_MEMORY_ACCESS. 2595 * 2596 * @param ptr The pointer to read from. 2597 * @return The 32-bit little endian integer from the bytes at @p ptr. 2598 */ 2599 2600 /*! 2601 * @internal 2602 * @fn xxh_u32 XXH_readBE32(const void* ptr) 2603 * @brief Reads an unaligned 32-bit big endian integer from @p ptr. 2604 * 2605 * Affected by @ref XXH_FORCE_MEMORY_ACCESS. 2606 * 2607 * @param ptr The pointer to read from. 2608 * @return The 32-bit big endian integer from the bytes at @p ptr. 2609 */ 2610 2611 /*! 2612 * @internal 2613 * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) 2614 * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. 2615 * 2616 * Affected by @ref XXH_FORCE_MEMORY_ACCESS. 2617 * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is 2618 * always @ref XXH_alignment::XXH_unaligned. 2619 * 2620 * @param ptr The pointer to read from. 2621 * @param align Whether @p ptr is aligned. 2622 * @pre 2623 * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte 2624 * aligned. 2625 * @return The 32-bit little endian integer from the bytes at @p ptr. 2626 */ 2627 2628 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) 2629 /* 2630 * Manual byteshift. Best for old compilers which don't inline memcpy. 2631 * We actually directly use XXH_readLE32 and XXH_readBE32. 2632 */ 2633 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) 2634 2635 /* 2636 * Force direct memory access. Only works on CPU which support unaligned memory 2637 * access in hardware. 2638 */ 2639 static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } 2640 2641 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) 2642 2643 /* 2644 * __attribute__((aligned(1))) is supported by gcc and clang. Originally the 2645 * documentation claimed that it only increased the alignment, but actually it 2646 * can decrease it on gcc, clang, and icc: 2647 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, 2648 * https://gcc.godbolt.org/z/xYez1j67Y. 2649 */ 2650 #ifdef XXH_OLD_NAMES 2651 typedef union { xxh_u32 u32; } __attribute__((__packed__)) unalign; 2652 #endif 2653 static xxh_u32 XXH_read32(const void* ptr) 2654 { 2655 typedef __attribute__((__aligned__(1))) __attribute__((__may_alias__)) xxh_u32 xxh_unalign32; 2656 return *((const xxh_unalign32*)ptr); 2657 } 2658 2659 #else 2660 2661 /* 2662 * Portable and safe solution. Generally efficient. 2663 * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html 2664 */ 2665 static xxh_u32 XXH_read32(const void* memPtr) 2666 { 2667 xxh_u32 val; 2668 XXH_memcpy(&val, memPtr, sizeof(val)); 2669 return val; 2670 } 2671 2672 #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ 2673 2674 2675 /* *** Endianness *** */ 2676 2677 /*! 2678 * @ingroup tuning 2679 * @def XXH_CPU_LITTLE_ENDIAN 2680 * @brief Whether the target is little endian. 2681 * 2682 * Defined to 1 if the target is little endian, or 0 if it is big endian. 2683 * It can be defined externally, for example on the compiler command line. 2684 * 2685 * If it is not defined, 2686 * a runtime check (which is usually constant folded) is used instead. 2687 * 2688 * @note 2689 * This is not necessarily defined to an integer constant. 2690 * 2691 * @see XXH_isLittleEndian() for the runtime check. 2692 */ 2693 #ifndef XXH_CPU_LITTLE_ENDIAN 2694 /* 2695 * Try to detect endianness automatically, to avoid the nonstandard behavior 2696 * in `XXH_isLittleEndian()` 2697 */ 2698 # if defined(_WIN32) /* Windows is always little endian */ \ 2699 || defined(__LITTLE_ENDIAN__) \ 2700 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 2701 # define XXH_CPU_LITTLE_ENDIAN 1 2702 # elif defined(__BIG_ENDIAN__) \ 2703 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 2704 # define XXH_CPU_LITTLE_ENDIAN 0 2705 # else 2706 /*! 2707 * @internal 2708 * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. 2709 * 2710 * Most compilers will constant fold this. 2711 */ 2712 static int XXH_isLittleEndian(void) 2713 { 2714 /* 2715 * Portable and well-defined behavior. 2716 * Don't use static: it is detrimental to performance. 2717 */ 2718 const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; 2719 return one.c[0]; 2720 } 2721 # define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() 2722 # endif 2723 #endif 2724 2725 2726 2727 2728 /* **************************************** 2729 * Compiler-specific Functions and Macros 2730 ******************************************/ 2731 #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) 2732 2733 #ifdef __has_builtin 2734 # define XXH_HAS_BUILTIN(x) __has_builtin(x) 2735 #else 2736 # define XXH_HAS_BUILTIN(x) 0 2737 #endif 2738 2739 2740 2741 /* 2742 * C23 and future versions have standard "unreachable()". 2743 * Once it has been implemented reliably we can add it as an 2744 * additional case: 2745 * 2746 * ``` 2747 * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L) 2748 * # include <stddef.h> 2749 * # ifdef unreachable 2750 * # define XXH_UNREACHABLE() unreachable() 2751 * # endif 2752 * #endif 2753 * ``` 2754 * 2755 * Note C++23 also has std::unreachable() which can be detected 2756 * as follows: 2757 * ``` 2758 * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L) 2759 * # include <utility> 2760 * # define XXH_UNREACHABLE() std::unreachable() 2761 * #endif 2762 * ``` 2763 * NB: `__cpp_lib_unreachable` is defined in the `<version>` header. 2764 * We don't use that as including `<utility>` in `extern "C"` blocks 2765 * doesn't work on GCC12 2766 */ 2767 2768 #if XXH_HAS_BUILTIN(__builtin_unreachable) 2769 # define XXH_UNREACHABLE() __builtin_unreachable() 2770 2771 #elif defined(_MSC_VER) 2772 # define XXH_UNREACHABLE() __assume(0) 2773 2774 #else 2775 # define XXH_UNREACHABLE() 2776 #endif 2777 2778 #if XXH_HAS_BUILTIN(__builtin_assume) 2779 # define XXH_ASSUME(c) __builtin_assume(c) 2780 #else 2781 # define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); } 2782 #endif 2783 2784 /*! 2785 * @internal 2786 * @def XXH_rotl32(x,r) 2787 * @brief 32-bit rotate left. 2788 * 2789 * @param x The 32-bit integer to be rotated. 2790 * @param r The number of bits to rotate. 2791 * @pre 2792 * @p r > 0 && @p r < 32 2793 * @note 2794 * @p x and @p r may be evaluated multiple times. 2795 * @return The rotated result. 2796 */ 2797 #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ 2798 && XXH_HAS_BUILTIN(__builtin_rotateleft64) 2799 # define XXH_rotl32 __builtin_rotateleft32 2800 # define XXH_rotl64 __builtin_rotateleft64 2801 #elif XXH_HAS_BUILTIN(__builtin_stdc_rotate_left) 2802 # define XXH_rotl32 __builtin_stdc_rotate_left 2803 # define XXH_rotl64 __builtin_stdc_rotate_left 2804 /* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ 2805 #elif defined(_MSC_VER) 2806 # define XXH_rotl32(x,r) _rotl(x,r) 2807 # define XXH_rotl64(x,r) _rotl64(x,r) 2808 #else 2809 # define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) 2810 # define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) 2811 #endif 2812 2813 /*! 2814 * @internal 2815 * @fn xxh_u32 XXH_swap32(xxh_u32 x) 2816 * @brief A 32-bit byteswap. 2817 * 2818 * @param x The 32-bit integer to byteswap. 2819 * @return @p x, byteswapped. 2820 */ 2821 #if defined(_MSC_VER) /* Visual Studio */ 2822 # define XXH_swap32 _byteswap_ulong 2823 #elif XXH_GCC_VERSION >= 403 2824 # define XXH_swap32 __builtin_bswap32 2825 #else 2826 static xxh_u32 XXH_swap32 (xxh_u32 x) 2827 { 2828 return ((x << 24) & 0xff000000 ) | 2829 ((x << 8) & 0x00ff0000 ) | 2830 ((x >> 8) & 0x0000ff00 ) | 2831 ((x >> 24) & 0x000000ff ); 2832 } 2833 #endif 2834 2835 2836 /* *************************** 2837 * Memory reads 2838 *****************************/ 2839 2840 /*! 2841 * @internal 2842 * @brief Enum to indicate whether a pointer is aligned. 2843 */ 2844 typedef enum { 2845 XXH_aligned, /*!< Aligned */ 2846 XXH_unaligned /*!< Possibly unaligned */ 2847 } XXH_alignment; 2848 2849 /* 2850 * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. 2851 * 2852 * This is ideal for older compilers which don't inline memcpy. 2853 */ 2854 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) 2855 2856 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) 2857 { 2858 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; 2859 return bytePtr[0] 2860 | ((xxh_u32)bytePtr[1] << 8) 2861 | ((xxh_u32)bytePtr[2] << 16) 2862 | ((xxh_u32)bytePtr[3] << 24); 2863 } 2864 2865 XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) 2866 { 2867 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; 2868 return bytePtr[3] 2869 | ((xxh_u32)bytePtr[2] << 8) 2870 | ((xxh_u32)bytePtr[1] << 16) 2871 | ((xxh_u32)bytePtr[0] << 24); 2872 } 2873 2874 #else 2875 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) 2876 { 2877 return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); 2878 } 2879 2880 static xxh_u32 XXH_readBE32(const void* ptr) 2881 { 2882 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); 2883 } 2884 #endif 2885 2886 XXH_FORCE_INLINE xxh_u32 2887 XXH_readLE32_align(const void* ptr, XXH_alignment align) 2888 { 2889 if (align==XXH_unaligned) { 2890 return XXH_readLE32(ptr); 2891 } else { 2892 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); 2893 } 2894 } 2895 2896 2897 /* ************************************* 2898 * Misc 2899 ***************************************/ 2900 /*! @ingroup public */ 2901 XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } 2902 2903 2904 /* ******************************************************************* 2905 * 32-bit hash functions 2906 *********************************************************************/ 2907 /*! 2908 * @} 2909 * @defgroup XXH32_impl XXH32 implementation 2910 * @ingroup impl 2911 * 2912 * Details on the XXH32 implementation. 2913 * @{ 2914 */ 2915 /* #define instead of static const, to be used as initializers */ 2916 #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ 2917 #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ 2918 #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ 2919 #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ 2920 #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ 2921 2922 #ifdef XXH_OLD_NAMES 2923 # define PRIME32_1 XXH_PRIME32_1 2924 # define PRIME32_2 XXH_PRIME32_2 2925 # define PRIME32_3 XXH_PRIME32_3 2926 # define PRIME32_4 XXH_PRIME32_4 2927 # define PRIME32_5 XXH_PRIME32_5 2928 #endif 2929 2930 /*! 2931 * @internal 2932 * @brief Normal stripe processing routine. 2933 * 2934 * This shuffles the bits so that any bit from @p input impacts several bits in 2935 * @p acc. 2936 * 2937 * @param acc The accumulator lane. 2938 * @param input The stripe of input to mix. 2939 * @return The mixed accumulator lane. 2940 */ 2941 static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) 2942 { 2943 acc += input * XXH_PRIME32_2; 2944 acc = XXH_rotl32(acc, 13); 2945 acc *= XXH_PRIME32_1; 2946 #if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) 2947 /* 2948 * UGLY HACK: 2949 * A compiler fence is used to prevent GCC and Clang from 2950 * autovectorizing the XXH32 loop (pragmas and attributes don't work for some 2951 * reason) without globally disabling SSE4.1. 2952 * 2953 * The reason we want to avoid vectorization is because despite working on 2954 * 4 integers at a time, there are multiple factors slowing XXH32 down on 2955 * SSE4: 2956 * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on 2957 * newer chips!) making it slightly slower to multiply four integers at 2958 * once compared to four integers independently. Even when pmulld was 2959 * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE 2960 * just to multiply unless doing a long operation. 2961 * 2962 * - Four instructions are required to rotate, 2963 * movqda tmp, v // not required with VEX encoding 2964 * pslld tmp, 13 // tmp <<= 13 2965 * psrld v, 19 // x >>= 19 2966 * por v, tmp // x |= tmp 2967 * compared to one for scalar: 2968 * roll v, 13 // reliably fast across the board 2969 * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason 2970 * 2971 * - Instruction level parallelism is actually more beneficial here because 2972 * the SIMD actually serializes this operation: While v1 is rotating, v2 2973 * can load data, while v3 can multiply. SSE forces them to operate 2974 * together. 2975 * 2976 * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing 2977 * the loop. NEON is only faster on the A53, and with the newer cores, it is less 2978 * than half the speed. 2979 * 2980 * Additionally, this is used on WASM SIMD128 because it JITs to the same 2981 * SIMD instructions and has the same issue. 2982 */ 2983 XXH_COMPILER_GUARD(acc); 2984 #endif 2985 return acc; 2986 } 2987 2988 /*! 2989 * @internal 2990 * @brief Mixes all bits to finalize the hash. 2991 * 2992 * The final mix ensures that all input bits have a chance to impact any bit in 2993 * the output digest, resulting in an unbiased distribution. 2994 * 2995 * @param hash The hash to avalanche. 2996 * @return The avalanched hash. 2997 */ 2998 static xxh_u32 XXH32_avalanche(xxh_u32 hash) 2999 { 3000 hash ^= hash >> 15; 3001 hash *= XXH_PRIME32_2; 3002 hash ^= hash >> 13; 3003 hash *= XXH_PRIME32_3; 3004 hash ^= hash >> 16; 3005 return hash; 3006 } 3007 3008 #define XXH_get32bits(p) XXH_readLE32_align(p, align) 3009 3010 /*! 3011 * @internal 3012 * @brief Sets up the initial accumulator state for XXH32(). 3013 */ 3014 XXH_FORCE_INLINE void 3015 XXH32_initAccs(xxh_u32 *acc, xxh_u32 seed) 3016 { 3017 XXH_ASSERT(acc != NULL); 3018 acc[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; 3019 acc[1] = seed + XXH_PRIME32_2; 3020 acc[2] = seed + 0; 3021 acc[3] = seed - XXH_PRIME32_1; 3022 } 3023 3024 /*! 3025 * @internal 3026 * @brief Consumes a block of data for XXH32(). 3027 * 3028 * @return the end input pointer. 3029 */ 3030 XXH_FORCE_INLINE const xxh_u8 * 3031 XXH32_consumeLong( 3032 xxh_u32 *XXH_RESTRICT acc, 3033 xxh_u8 const *XXH_RESTRICT input, 3034 size_t len, 3035 XXH_alignment align 3036 ) 3037 { 3038 const xxh_u8* const bEnd = input + len; 3039 const xxh_u8* const limit = bEnd - 15; 3040 XXH_ASSERT(acc != NULL); 3041 XXH_ASSERT(input != NULL); 3042 XXH_ASSERT(len >= 16); 3043 do { 3044 acc[0] = XXH32_round(acc[0], XXH_get32bits(input)); input += 4; 3045 acc[1] = XXH32_round(acc[1], XXH_get32bits(input)); input += 4; 3046 acc[2] = XXH32_round(acc[2], XXH_get32bits(input)); input += 4; 3047 acc[3] = XXH32_round(acc[3], XXH_get32bits(input)); input += 4; 3048 } while (input < limit); 3049 3050 return input; 3051 } 3052 3053 /*! 3054 * @internal 3055 * @brief Merges the accumulator lanes together for XXH32() 3056 */ 3057 XXH_FORCE_INLINE XXH_PUREF xxh_u32 3058 XXH32_mergeAccs(const xxh_u32 *acc) 3059 { 3060 XXH_ASSERT(acc != NULL); 3061 return XXH_rotl32(acc[0], 1) + XXH_rotl32(acc[1], 7) 3062 + XXH_rotl32(acc[2], 12) + XXH_rotl32(acc[3], 18); 3063 } 3064 3065 /*! 3066 * @internal 3067 * @brief Processes the last 0-15 bytes of @p ptr. 3068 * 3069 * There may be up to 15 bytes remaining to consume from the input. 3070 * This final stage will digest them to ensure that all input bytes are present 3071 * in the final mix. 3072 * 3073 * @param hash The hash to finalize. 3074 * @param ptr The pointer to the remaining input. 3075 * @param len The remaining length, modulo 16. 3076 * @param align Whether @p ptr is aligned. 3077 * @return The finalized hash. 3078 * @see XXH64_finalize(). 3079 */ 3080 static XXH_PUREF xxh_u32 3081 XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) 3082 { 3083 #define XXH_PROCESS1 do { \ 3084 hash += (*ptr++) * XXH_PRIME32_5; \ 3085 hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ 3086 } while (0) 3087 3088 #define XXH_PROCESS4 do { \ 3089 hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ 3090 ptr += 4; \ 3091 hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ 3092 } while (0) 3093 3094 if (ptr==NULL) XXH_ASSERT(len == 0); 3095 3096 /* Compact rerolled version; generally faster */ 3097 if (!XXH32_ENDJMP) { 3098 len &= 15; 3099 while (len >= 4) { 3100 XXH_PROCESS4; 3101 len -= 4; 3102 } 3103 while (len > 0) { 3104 XXH_PROCESS1; 3105 --len; 3106 } 3107 return XXH32_avalanche(hash); 3108 } else { 3109 switch(len&15) /* or switch(bEnd - p) */ { 3110 case 12: XXH_PROCESS4; 3111 XXH_FALLTHROUGH; /* fallthrough */ 3112 case 8: XXH_PROCESS4; 3113 XXH_FALLTHROUGH; /* fallthrough */ 3114 case 4: XXH_PROCESS4; 3115 return XXH32_avalanche(hash); 3116 3117 case 13: XXH_PROCESS4; 3118 XXH_FALLTHROUGH; /* fallthrough */ 3119 case 9: XXH_PROCESS4; 3120 XXH_FALLTHROUGH; /* fallthrough */ 3121 case 5: XXH_PROCESS4; 3122 XXH_PROCESS1; 3123 return XXH32_avalanche(hash); 3124 3125 case 14: XXH_PROCESS4; 3126 XXH_FALLTHROUGH; /* fallthrough */ 3127 case 10: XXH_PROCESS4; 3128 XXH_FALLTHROUGH; /* fallthrough */ 3129 case 6: XXH_PROCESS4; 3130 XXH_PROCESS1; 3131 XXH_PROCESS1; 3132 return XXH32_avalanche(hash); 3133 3134 case 15: XXH_PROCESS4; 3135 XXH_FALLTHROUGH; /* fallthrough */ 3136 case 11: XXH_PROCESS4; 3137 XXH_FALLTHROUGH; /* fallthrough */ 3138 case 7: XXH_PROCESS4; 3139 XXH_FALLTHROUGH; /* fallthrough */ 3140 case 3: XXH_PROCESS1; 3141 XXH_FALLTHROUGH; /* fallthrough */ 3142 case 2: XXH_PROCESS1; 3143 XXH_FALLTHROUGH; /* fallthrough */ 3144 case 1: XXH_PROCESS1; 3145 XXH_FALLTHROUGH; /* fallthrough */ 3146 case 0: return XXH32_avalanche(hash); 3147 } 3148 XXH_ASSERT(0); 3149 return hash; /* reaching this point is deemed impossible */ 3150 } 3151 } 3152 3153 #ifdef XXH_OLD_NAMES 3154 # define PROCESS1 XXH_PROCESS1 3155 # define PROCESS4 XXH_PROCESS4 3156 #else 3157 # undef XXH_PROCESS1 3158 # undef XXH_PROCESS4 3159 #endif 3160 3161 /*! 3162 * @internal 3163 * @brief The implementation for @ref XXH32(). 3164 * 3165 * @param input , len , seed Directly passed from @ref XXH32(). 3166 * @param align Whether @p input is aligned. 3167 * @return The calculated hash. 3168 */ 3169 XXH_FORCE_INLINE XXH_PUREF xxh_u32 3170 XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) 3171 { 3172 xxh_u32 h32; 3173 3174 if (input==NULL) XXH_ASSERT(len == 0); 3175 3176 if (len>=16) { 3177 xxh_u32 acc[4]; 3178 XXH32_initAccs(acc, seed); 3179 3180 input = XXH32_consumeLong(acc, input, len, align); 3181 3182 h32 = XXH32_mergeAccs(acc); 3183 } else { 3184 h32 = seed + XXH_PRIME32_5; 3185 } 3186 3187 h32 += (xxh_u32)len; 3188 3189 return XXH32_finalize(h32, input, len&15, align); 3190 } 3191 3192 /*! @ingroup XXH32_family */ 3193 XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) 3194 { 3195 #if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 3196 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ 3197 XXH32_state_t state; 3198 XXH32_reset(&state, seed); 3199 XXH32_update(&state, (const xxh_u8*)input, len); 3200 return XXH32_digest(&state); 3201 #else 3202 if (XXH_FORCE_ALIGN_CHECK) { 3203 if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ 3204 return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); 3205 } } 3206 3207 return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); 3208 #endif 3209 } 3210 3211 3212 3213 /******* Hash streaming *******/ 3214 #ifndef XXH_NO_STREAM 3215 /*! @ingroup XXH32_family */ 3216 XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) 3217 { 3218 return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); 3219 } 3220 /*! @ingroup XXH32_family */ 3221 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) 3222 { 3223 XXH_free(statePtr); 3224 return XXH_OK; 3225 } 3226 3227 /*! @ingroup XXH32_family */ 3228 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) 3229 { 3230 XXH_memcpy(dstState, srcState, sizeof(*dstState)); 3231 } 3232 3233 /*! @ingroup XXH32_family */ 3234 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) 3235 { 3236 XXH_ASSERT(statePtr != NULL); 3237 XXH_memset(statePtr, 0, sizeof(*statePtr)); 3238 XXH32_initAccs(statePtr->acc, seed); 3239 return XXH_OK; 3240 } 3241 3242 3243 /*! @ingroup XXH32_family */ 3244 XXH_PUBLIC_API XXH_errorcode 3245 XXH32_update(XXH32_state_t* state, const void* input, size_t len) 3246 { 3247 if (input==NULL) { 3248 XXH_ASSERT(len == 0); 3249 return XXH_OK; 3250 } 3251 3252 state->total_len_32 += (XXH32_hash_t)len; 3253 state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); 3254 3255 XXH_ASSERT(state->bufferedSize < sizeof(state->buffer)); 3256 if (len < sizeof(state->buffer) - state->bufferedSize) { /* fill in tmp buffer */ 3257 XXH_memcpy(state->buffer + state->bufferedSize, input, len); 3258 state->bufferedSize += (XXH32_hash_t)len; 3259 return XXH_OK; 3260 } 3261 3262 { const xxh_u8* xinput = (const xxh_u8*)input; 3263 const xxh_u8* const bEnd = xinput + len; 3264 3265 if (state->bufferedSize) { /* non-empty buffer: complete first */ 3266 XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize); 3267 xinput += sizeof(state->buffer) - state->bufferedSize; 3268 /* then process one round */ 3269 (void)XXH32_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned); 3270 state->bufferedSize = 0; 3271 } 3272 3273 XXH_ASSERT(xinput <= bEnd); 3274 if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) { 3275 /* Process the remaining data */ 3276 xinput = XXH32_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned); 3277 } 3278 3279 if (xinput < bEnd) { 3280 /* Copy the leftover to the tmp buffer */ 3281 XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput)); 3282 state->bufferedSize = (unsigned)(bEnd-xinput); 3283 } 3284 } 3285 3286 return XXH_OK; 3287 } 3288 3289 3290 /*! @ingroup XXH32_family */ 3291 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) 3292 { 3293 xxh_u32 h32; 3294 3295 if (state->large_len) { 3296 h32 = XXH32_mergeAccs(state->acc); 3297 } else { 3298 h32 = state->acc[2] /* == seed */ + XXH_PRIME32_5; 3299 } 3300 3301 h32 += state->total_len_32; 3302 3303 return XXH32_finalize(h32, state->buffer, state->bufferedSize, XXH_aligned); 3304 } 3305 #endif /* !XXH_NO_STREAM */ 3306 3307 /******* Canonical representation *******/ 3308 3309 /*! @ingroup XXH32_family */ 3310 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) 3311 { 3312 XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); 3313 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); 3314 XXH_memcpy(dst, &hash, sizeof(*dst)); 3315 } 3316 /*! @ingroup XXH32_family */ 3317 XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) 3318 { 3319 return XXH_readBE32(src); 3320 } 3321 3322 3323 #ifndef XXH_NO_LONG_LONG 3324 3325 /* ******************************************************************* 3326 * 64-bit hash functions 3327 *********************************************************************/ 3328 /*! 3329 * @} 3330 * @ingroup impl 3331 * @{ 3332 */ 3333 /******* Memory access *******/ 3334 3335 typedef XXH64_hash_t xxh_u64; 3336 3337 #ifdef XXH_OLD_NAMES 3338 # define U64 xxh_u64 3339 #endif 3340 3341 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) 3342 /* 3343 * Manual byteshift. Best for old compilers which don't inline memcpy. 3344 * We actually directly use XXH_readLE64 and XXH_readBE64. 3345 */ 3346 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) 3347 3348 /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ 3349 static xxh_u64 XXH_read64(const void* memPtr) 3350 { 3351 return *(const xxh_u64*) memPtr; 3352 } 3353 3354 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) 3355 3356 /* 3357 * __attribute__((aligned(1))) is supported by gcc and clang. Originally the 3358 * documentation claimed that it only increased the alignment, but actually it 3359 * can decrease it on gcc, clang, and icc: 3360 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, 3361 * https://gcc.godbolt.org/z/xYez1j67Y. 3362 */ 3363 #ifdef XXH_OLD_NAMES 3364 typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((__packed__)) unalign64; 3365 #endif 3366 static xxh_u64 XXH_read64(const void* ptr) 3367 { 3368 typedef __attribute__((__aligned__(1))) __attribute__((__may_alias__)) xxh_u64 xxh_unalign64; 3369 return *((const xxh_unalign64*)ptr); 3370 } 3371 3372 #else 3373 3374 /* 3375 * Portable and safe solution. Generally efficient. 3376 * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html 3377 */ 3378 static xxh_u64 XXH_read64(const void* memPtr) 3379 { 3380 xxh_u64 val; 3381 XXH_memcpy(&val, memPtr, sizeof(val)); 3382 return val; 3383 } 3384 3385 #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ 3386 3387 #if defined(_MSC_VER) /* Visual Studio */ 3388 # define XXH_swap64 _byteswap_uint64 3389 #elif XXH_GCC_VERSION >= 403 3390 # define XXH_swap64 __builtin_bswap64 3391 #else 3392 static xxh_u64 XXH_swap64(xxh_u64 x) 3393 { 3394 return ((x << 56) & 0xff00000000000000ULL) | 3395 ((x << 40) & 0x00ff000000000000ULL) | 3396 ((x << 24) & 0x0000ff0000000000ULL) | 3397 ((x << 8) & 0x000000ff00000000ULL) | 3398 ((x >> 8) & 0x00000000ff000000ULL) | 3399 ((x >> 24) & 0x0000000000ff0000ULL) | 3400 ((x >> 40) & 0x000000000000ff00ULL) | 3401 ((x >> 56) & 0x00000000000000ffULL); 3402 } 3403 #endif 3404 3405 3406 /* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ 3407 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) 3408 3409 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) 3410 { 3411 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; 3412 return bytePtr[0] 3413 | ((xxh_u64)bytePtr[1] << 8) 3414 | ((xxh_u64)bytePtr[2] << 16) 3415 | ((xxh_u64)bytePtr[3] << 24) 3416 | ((xxh_u64)bytePtr[4] << 32) 3417 | ((xxh_u64)bytePtr[5] << 40) 3418 | ((xxh_u64)bytePtr[6] << 48) 3419 | ((xxh_u64)bytePtr[7] << 56); 3420 } 3421 3422 XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) 3423 { 3424 const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; 3425 return bytePtr[7] 3426 | ((xxh_u64)bytePtr[6] << 8) 3427 | ((xxh_u64)bytePtr[5] << 16) 3428 | ((xxh_u64)bytePtr[4] << 24) 3429 | ((xxh_u64)bytePtr[3] << 32) 3430 | ((xxh_u64)bytePtr[2] << 40) 3431 | ((xxh_u64)bytePtr[1] << 48) 3432 | ((xxh_u64)bytePtr[0] << 56); 3433 } 3434 3435 #else 3436 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) 3437 { 3438 return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); 3439 } 3440 3441 static xxh_u64 XXH_readBE64(const void* ptr) 3442 { 3443 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); 3444 } 3445 #endif 3446 3447 XXH_FORCE_INLINE xxh_u64 3448 XXH_readLE64_align(const void* ptr, XXH_alignment align) 3449 { 3450 if (align==XXH_unaligned) 3451 return XXH_readLE64(ptr); 3452 else 3453 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); 3454 } 3455 3456 3457 /******* xxh64 *******/ 3458 /*! 3459 * @} 3460 * @defgroup XXH64_impl XXH64 implementation 3461 * @ingroup impl 3462 * 3463 * Details on the XXH64 implementation. 3464 * @{ 3465 */ 3466 /* #define rather that static const, to be used as initializers */ 3467 #define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ 3468 #define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ 3469 #define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ 3470 #define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ 3471 #define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ 3472 3473 #ifdef XXH_OLD_NAMES 3474 # define PRIME64_1 XXH_PRIME64_1 3475 # define PRIME64_2 XXH_PRIME64_2 3476 # define PRIME64_3 XXH_PRIME64_3 3477 # define PRIME64_4 XXH_PRIME64_4 3478 # define PRIME64_5 XXH_PRIME64_5 3479 #endif 3480 3481 /*! @copydoc XXH32_round */ 3482 static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) 3483 { 3484 acc += input * XXH_PRIME64_2; 3485 acc = XXH_rotl64(acc, 31); 3486 acc *= XXH_PRIME64_1; 3487 #if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) 3488 /* 3489 * DISABLE AUTOVECTORIZATION: 3490 * A compiler fence is used to prevent GCC and Clang from 3491 * autovectorizing the XXH64 loop (pragmas and attributes don't work for some 3492 * reason) without globally disabling AVX512. 3493 * 3494 * Autovectorization of XXH64 tends to be detrimental, 3495 * though the exact outcome may change depending on exact cpu and compiler version. 3496 * For information, it has been reported as detrimental for Skylake-X, 3497 * but possibly beneficial for Zen4. 3498 * 3499 * The default is to disable auto-vectorization, 3500 * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable. 3501 */ 3502 XXH_COMPILER_GUARD(acc); 3503 #endif 3504 return acc; 3505 } 3506 3507 static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) 3508 { 3509 val = XXH64_round(0, val); 3510 acc ^= val; 3511 acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; 3512 return acc; 3513 } 3514 3515 /*! @copydoc XXH32_avalanche */ 3516 static xxh_u64 XXH64_avalanche(xxh_u64 hash) 3517 { 3518 hash ^= hash >> 33; 3519 hash *= XXH_PRIME64_2; 3520 hash ^= hash >> 29; 3521 hash *= XXH_PRIME64_3; 3522 hash ^= hash >> 32; 3523 return hash; 3524 } 3525 3526 3527 #define XXH_get64bits(p) XXH_readLE64_align(p, align) 3528 3529 /*! 3530 * @internal 3531 * @brief Sets up the initial accumulator state for XXH64(). 3532 */ 3533 XXH_FORCE_INLINE void 3534 XXH64_initAccs(xxh_u64 *acc, xxh_u64 seed) 3535 { 3536 XXH_ASSERT(acc != NULL); 3537 acc[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; 3538 acc[1] = seed + XXH_PRIME64_2; 3539 acc[2] = seed + 0; 3540 acc[3] = seed - XXH_PRIME64_1; 3541 } 3542 3543 /*! 3544 * @internal 3545 * @brief Consumes a block of data for XXH64(). 3546 * 3547 * @return the end input pointer. 3548 */ 3549 XXH_FORCE_INLINE const xxh_u8 * 3550 XXH64_consumeLong( 3551 xxh_u64 *XXH_RESTRICT acc, 3552 xxh_u8 const *XXH_RESTRICT input, 3553 size_t len, 3554 XXH_alignment align 3555 ) 3556 { 3557 const xxh_u8* const bEnd = input + len; 3558 const xxh_u8* const limit = bEnd - 31; 3559 XXH_ASSERT(acc != NULL); 3560 XXH_ASSERT(input != NULL); 3561 XXH_ASSERT(len >= 32); 3562 do { 3563 /* reroll on 32-bit */ 3564 if (sizeof(void *) < sizeof(xxh_u64)) { 3565 size_t i; 3566 for (i = 0; i < 4; i++) { 3567 acc[i] = XXH64_round(acc[i], XXH_get64bits(input)); 3568 input += 8; 3569 } 3570 } else { 3571 acc[0] = XXH64_round(acc[0], XXH_get64bits(input)); input += 8; 3572 acc[1] = XXH64_round(acc[1], XXH_get64bits(input)); input += 8; 3573 acc[2] = XXH64_round(acc[2], XXH_get64bits(input)); input += 8; 3574 acc[3] = XXH64_round(acc[3], XXH_get64bits(input)); input += 8; 3575 } 3576 } while (input < limit); 3577 3578 return input; 3579 } 3580 3581 /*! 3582 * @internal 3583 * @brief Merges the accumulator lanes together for XXH64() 3584 */ 3585 XXH_FORCE_INLINE XXH_PUREF xxh_u64 3586 XXH64_mergeAccs(const xxh_u64 *acc) 3587 { 3588 XXH_ASSERT(acc != NULL); 3589 { 3590 xxh_u64 h64 = XXH_rotl64(acc[0], 1) + XXH_rotl64(acc[1], 7) 3591 + XXH_rotl64(acc[2], 12) + XXH_rotl64(acc[3], 18); 3592 /* reroll on 32-bit */ 3593 if (sizeof(void *) < sizeof(xxh_u64)) { 3594 size_t i; 3595 for (i = 0; i < 4; i++) { 3596 h64 = XXH64_mergeRound(h64, acc[i]); 3597 } 3598 } else { 3599 h64 = XXH64_mergeRound(h64, acc[0]); 3600 h64 = XXH64_mergeRound(h64, acc[1]); 3601 h64 = XXH64_mergeRound(h64, acc[2]); 3602 h64 = XXH64_mergeRound(h64, acc[3]); 3603 } 3604 return h64; 3605 } 3606 } 3607 3608 /*! 3609 * @internal 3610 * @brief Processes the last 0-31 bytes of @p ptr. 3611 * 3612 * There may be up to 31 bytes remaining to consume from the input. 3613 * This final stage will digest them to ensure that all input bytes are present 3614 * in the final mix. 3615 * 3616 * @param hash The hash to finalize. 3617 * @param ptr The pointer to the remaining input. 3618 * @param len The remaining length, modulo 32. 3619 * @param align Whether @p ptr is aligned. 3620 * @return The finalized hash 3621 * @see XXH32_finalize(). 3622 */ 3623 XXH_STATIC XXH_PUREF xxh_u64 3624 XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) 3625 { 3626 if (ptr==NULL) XXH_ASSERT(len == 0); 3627 len &= 31; 3628 while (len >= 8) { 3629 xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); 3630 ptr += 8; 3631 hash ^= k1; 3632 hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; 3633 len -= 8; 3634 } 3635 if (len >= 4) { 3636 hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; 3637 ptr += 4; 3638 hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; 3639 len -= 4; 3640 } 3641 while (len > 0) { 3642 hash ^= (*ptr++) * XXH_PRIME64_5; 3643 hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; 3644 --len; 3645 } 3646 return XXH64_avalanche(hash); 3647 } 3648 3649 #ifdef XXH_OLD_NAMES 3650 # define PROCESS1_64 XXH_PROCESS1_64 3651 # define PROCESS4_64 XXH_PROCESS4_64 3652 # define PROCESS8_64 XXH_PROCESS8_64 3653 #else 3654 # undef XXH_PROCESS1_64 3655 # undef XXH_PROCESS4_64 3656 # undef XXH_PROCESS8_64 3657 #endif 3658 3659 /*! 3660 * @internal 3661 * @brief The implementation for @ref XXH64(). 3662 * 3663 * @param input , len , seed Directly passed from @ref XXH64(). 3664 * @param align Whether @p input is aligned. 3665 * @return The calculated hash. 3666 */ 3667 XXH_FORCE_INLINE XXH_PUREF xxh_u64 3668 XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) 3669 { 3670 xxh_u64 h64; 3671 if (input==NULL) XXH_ASSERT(len == 0); 3672 3673 if (len>=32) { /* Process a large block of data */ 3674 xxh_u64 acc[4]; 3675 XXH64_initAccs(acc, seed); 3676 3677 input = XXH64_consumeLong(acc, input, len, align); 3678 3679 h64 = XXH64_mergeAccs(acc); 3680 } else { 3681 h64 = seed + XXH_PRIME64_5; 3682 } 3683 3684 h64 += (xxh_u64) len; 3685 3686 return XXH64_finalize(h64, input, len, align); 3687 } 3688 3689 3690 /*! @ingroup XXH64_family */ 3691 XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) 3692 { 3693 #if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 3694 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ 3695 XXH64_state_t state; 3696 XXH64_reset(&state, seed); 3697 XXH64_update(&state, (const xxh_u8*)input, len); 3698 return XXH64_digest(&state); 3699 #else 3700 if (XXH_FORCE_ALIGN_CHECK) { 3701 if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ 3702 return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); 3703 } } 3704 3705 return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); 3706 3707 #endif 3708 } 3709 3710 /******* Hash Streaming *******/ 3711 #ifndef XXH_NO_STREAM 3712 /*! @ingroup XXH64_family*/ 3713 XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) 3714 { 3715 return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); 3716 } 3717 /*! @ingroup XXH64_family */ 3718 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) 3719 { 3720 XXH_free(statePtr); 3721 return XXH_OK; 3722 } 3723 3724 /*! @ingroup XXH64_family */ 3725 XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) 3726 { 3727 XXH_memcpy(dstState, srcState, sizeof(*dstState)); 3728 } 3729 3730 /*! @ingroup XXH64_family */ 3731 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) 3732 { 3733 XXH_ASSERT(statePtr != NULL); 3734 XXH_memset(statePtr, 0, sizeof(*statePtr)); 3735 XXH64_initAccs(statePtr->acc, seed); 3736 return XXH_OK; 3737 } 3738 3739 /*! @ingroup XXH64_family */ 3740 XXH_PUBLIC_API XXH_errorcode 3741 XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) 3742 { 3743 if (input==NULL) { 3744 XXH_ASSERT(len == 0); 3745 return XXH_OK; 3746 } 3747 3748 state->total_len += len; 3749 3750 XXH_ASSERT(state->bufferedSize <= sizeof(state->buffer)); 3751 if (len < sizeof(state->buffer) - state->bufferedSize) { /* fill in tmp buffer */ 3752 XXH_memcpy(state->buffer + state->bufferedSize, input, len); 3753 state->bufferedSize += (XXH32_hash_t)len; 3754 return XXH_OK; 3755 } 3756 3757 { const xxh_u8* xinput = (const xxh_u8*)input; 3758 const xxh_u8* const bEnd = xinput + len; 3759 3760 if (state->bufferedSize) { /* non-empty buffer => complete first */ 3761 XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize); 3762 xinput += sizeof(state->buffer) - state->bufferedSize; 3763 /* and process one round */ 3764 (void)XXH64_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned); 3765 state->bufferedSize = 0; 3766 } 3767 3768 XXH_ASSERT(xinput <= bEnd); 3769 if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) { 3770 /* Process the remaining data */ 3771 xinput = XXH64_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned); 3772 } 3773 3774 if (xinput < bEnd) { 3775 /* Copy the leftover to the tmp buffer */ 3776 XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput)); 3777 state->bufferedSize = (unsigned)(bEnd-xinput); 3778 } 3779 } 3780 3781 return XXH_OK; 3782 } 3783 3784 3785 /*! @ingroup XXH64_family */ 3786 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) 3787 { 3788 xxh_u64 h64; 3789 3790 if (state->total_len >= 32) { 3791 h64 = XXH64_mergeAccs(state->acc); 3792 } else { 3793 h64 = state->acc[2] /*seed*/ + XXH_PRIME64_5; 3794 } 3795 3796 h64 += (xxh_u64) state->total_len; 3797 3798 return XXH64_finalize(h64, state->buffer, (size_t)state->total_len, XXH_aligned); 3799 } 3800 #endif /* !XXH_NO_STREAM */ 3801 3802 /******* Canonical representation *******/ 3803 3804 /*! @ingroup XXH64_family */ 3805 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) 3806 { 3807 XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); 3808 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); 3809 XXH_memcpy(dst, &hash, sizeof(*dst)); 3810 } 3811 3812 /*! @ingroup XXH64_family */ 3813 XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) 3814 { 3815 return XXH_readBE64(src); 3816 } 3817 3818 #ifndef XXH_NO_XXH3 3819 3820 /* ********************************************************************* 3821 * XXH3 3822 * New generation hash designed for speed on small keys and vectorization 3823 ************************************************************************ */ 3824 /*! 3825 * @} 3826 * @defgroup XXH3_impl XXH3 implementation 3827 * @ingroup impl 3828 * @{ 3829 */ 3830 3831 /* === Compiler specifics === */ 3832 3833 3834 #if (defined(__GNUC__) && (__GNUC__ >= 3)) \ 3835 || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ 3836 || defined(__clang__) 3837 # define XXH_likely(x) __builtin_expect(x, 1) 3838 # define XXH_unlikely(x) __builtin_expect(x, 0) 3839 #else 3840 # define XXH_likely(x) (x) 3841 # define XXH_unlikely(x) (x) 3842 #endif 3843 3844 #ifndef XXH_HAS_INCLUDE 3845 # ifdef __has_include 3846 /* 3847 * Not defined as XXH_HAS_INCLUDE(x) (function-like) because 3848 * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion) 3849 */ 3850 # define XXH_HAS_INCLUDE __has_include 3851 # else 3852 # define XXH_HAS_INCLUDE(x) 0 3853 # endif 3854 #endif 3855 3856 #if defined(__GNUC__) || defined(__clang__) 3857 # if defined(__ARM_FEATURE_SVE) 3858 # include <arm_sve.h> 3859 # endif 3860 # if defined(__ARM_NEON__) || defined(__ARM_NEON) \ 3861 || (defined(_M_ARM) && _M_ARM >= 7) \ 3862 || defined(_M_ARM64) || defined(_M_ARM64EC) \ 3863 || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */ 3864 # define inline __inline__ /* circumvent a clang bug */ 3865 # include <arm_neon.h> 3866 # undef inline 3867 # elif defined(__AVX2__) 3868 # include <immintrin.h> 3869 # elif defined(__SSE2__) 3870 # include <emmintrin.h> 3871 # elif defined(__loongarch_asx) 3872 # include <lasxintrin.h> 3873 # include <lsxintrin.h> 3874 # elif defined(__loongarch_sx) 3875 # include <lsxintrin.h> 3876 # elif defined(__riscv_vector) 3877 # include <riscv_vector.h> 3878 # endif 3879 #endif 3880 3881 #if defined(_MSC_VER) 3882 # include <intrin.h> 3883 #endif 3884 3885 /* 3886 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while 3887 * remaining a true 64-bit/128-bit hash function. 3888 * 3889 * This is done by prioritizing a subset of 64-bit operations that can be 3890 * emulated without too many steps on the average 32-bit machine. 3891 * 3892 * For example, these two lines seem similar, and run equally fast on 64-bit: 3893 * 3894 * xxh_u64 x; 3895 * x ^= (x >> 47); // good 3896 * x ^= (x >> 13); // bad 3897 * 3898 * However, to a 32-bit machine, there is a major difference. 3899 * 3900 * x ^= (x >> 47) looks like this: 3901 * 3902 * x.lo ^= (x.hi >> (47 - 32)); 3903 * 3904 * while x ^= (x >> 13) looks like this: 3905 * 3906 * // note: funnel shifts are not usually cheap. 3907 * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); 3908 * x.hi ^= (x.hi >> 13); 3909 * 3910 * The first one is significantly faster than the second, simply because the 3911 * shift is larger than 32. This means: 3912 * - All the bits we need are in the upper 32 bits, so we can ignore the lower 3913 * 32 bits in the shift. 3914 * - The shift result will always fit in the lower 32 bits, and therefore, 3915 * we can ignore the upper 32 bits in the xor. 3916 * 3917 * Thanks to this optimization, XXH3 only requires these features to be efficient: 3918 * 3919 * - Usable unaligned access 3920 * - A 32-bit or 64-bit ALU 3921 * - If 32-bit, a decent ADC instruction 3922 * - A 32 or 64-bit multiply with a 64-bit result 3923 * - For the 128-bit variant, a decent byteswap helps short inputs. 3924 * 3925 * The first two are already required by XXH32, and almost all 32-bit and 64-bit 3926 * platforms which can run XXH32 can run XXH3 efficiently. 3927 * 3928 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one 3929 * notable exception. 3930 * 3931 * First of all, Thumb-1 lacks support for the UMULL instruction which 3932 * performs the important long multiply. This means numerous __aeabi_lmul 3933 * calls. 3934 * 3935 * Second of all, the 8 functional registers are just not enough. 3936 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need 3937 * Lo registers, and this shuffling results in thousands more MOVs than A32. 3938 * 3939 * A32 and T32 don't have this limitation. They can access all 14 registers, 3940 * do a 32->64 multiply with UMULL, and the flexible operand allowing free 3941 * shifts is helpful, too. 3942 * 3943 * Therefore, we do a quick sanity check. 3944 * 3945 * If compiling Thumb-1 for a target which supports ARM instructions, we will 3946 * emit a warning, as it is not a "sane" platform to compile for. 3947 * 3948 * Usually, if this happens, it is because of an accident and you probably need 3949 * to specify -march, as you likely meant to compile for a newer architecture. 3950 * 3951 * Credit: large sections of the vectorial and asm source code paths 3952 * have been contributed by @easyaspi314 3953 */ 3954 #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) 3955 # warning "XXH3 is highly inefficient without ARM or Thumb-2." 3956 #endif 3957 3958 /* ========================================== 3959 * Vectorization detection 3960 * ========================================== */ 3961 3962 #ifdef XXH_DOXYGEN 3963 /*! 3964 * @ingroup tuning 3965 * @brief Overrides the vectorization implementation chosen for XXH3. 3966 * 3967 * Can be defined to 0 to disable SIMD, 3968 * or any other authorized value of @ref XXH_VECTOR. 3969 * 3970 * If this is not defined, it uses predefined macros to determine the best 3971 * implementation. 3972 */ 3973 # define XXH_VECTOR XXH_SCALAR 3974 /*! 3975 * @ingroup tuning 3976 * @brief Selects the minimum alignment for XXH3's accumulators. 3977 * 3978 * When using SIMD, this should match the alignment required for said vector 3979 * type, so, for example, 32 for AVX2. 3980 * 3981 * Default: Auto detected. 3982 */ 3983 # define XXH_ACC_ALIGN 8 3984 #endif 3985 3986 /* Actual definition */ 3987 #ifndef XXH_DOXYGEN 3988 #endif 3989 3990 #ifndef XXH_VECTOR /* can be defined on command line */ 3991 # if ( \ 3992 defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ 3993 || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ 3994 || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \ 3995 ) && ( \ 3996 defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ 3997 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ 3998 ) 3999 # define XXH_VECTOR XXH_NEON 4000 # elif defined(__ARM_FEATURE_SVE) 4001 # define XXH_VECTOR XXH_SVE 4002 # elif defined(__AVX512F__) 4003 # define XXH_VECTOR XXH_AVX512 4004 # elif defined(__AVX2__) 4005 # define XXH_VECTOR XXH_AVX2 4006 # elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) 4007 # define XXH_VECTOR XXH_SSE2 4008 # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ 4009 || (defined(__s390x__) && defined(__VEC__)) \ 4010 && defined(__GNUC__) /* TODO: IBM XL */ 4011 # define XXH_VECTOR XXH_VSX 4012 # elif defined(__loongarch_asx) 4013 # define XXH_VECTOR XXH_LASX 4014 # elif defined(__loongarch_sx) 4015 # define XXH_VECTOR XXH_LSX 4016 # elif defined(__riscv_vector) 4017 # define XXH_VECTOR XXH_RVV 4018 # else 4019 # define XXH_VECTOR XXH_SCALAR 4020 # endif 4021 #endif 4022 4023 /* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ 4024 #if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) 4025 # ifdef _MSC_VER 4026 # pragma warning(once : 4606) 4027 # else 4028 # warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." 4029 # endif 4030 # undef XXH_VECTOR 4031 # define XXH_VECTOR XXH_SCALAR 4032 #endif 4033 4034 /* 4035 * Controls the alignment of the accumulator, 4036 * for compatibility with aligned vector loads, which are usually faster. 4037 */ 4038 #ifndef XXH_ACC_ALIGN 4039 # if defined(XXH_X86DISPATCH) 4040 # define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ 4041 # elif XXH_VECTOR == XXH_SCALAR /* scalar */ 4042 # define XXH_ACC_ALIGN 8 4043 # elif XXH_VECTOR == XXH_SSE2 /* sse2 */ 4044 # define XXH_ACC_ALIGN 16 4045 # elif XXH_VECTOR == XXH_AVX2 /* avx2 */ 4046 # define XXH_ACC_ALIGN 32 4047 # elif XXH_VECTOR == XXH_NEON /* neon */ 4048 # define XXH_ACC_ALIGN 16 4049 # elif XXH_VECTOR == XXH_VSX /* vsx */ 4050 # define XXH_ACC_ALIGN 16 4051 # elif XXH_VECTOR == XXH_AVX512 /* avx512 */ 4052 # define XXH_ACC_ALIGN 64 4053 # elif XXH_VECTOR == XXH_SVE /* sve */ 4054 # define XXH_ACC_ALIGN 64 4055 # elif XXH_VECTOR == XXH_LASX /* lasx */ 4056 # define XXH_ACC_ALIGN 64 4057 # elif XXH_VECTOR == XXH_LSX /* lsx */ 4058 # define XXH_ACC_ALIGN 64 4059 # elif XXH_VECTOR == XXH_RVV /* rvv */ 4060 # define XXH_ACC_ALIGN 64 /* could be 8, but 64 may be faster */ 4061 # endif 4062 #endif 4063 4064 #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ 4065 || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 4066 # define XXH_SEC_ALIGN XXH_ACC_ALIGN 4067 #elif XXH_VECTOR == XXH_SVE 4068 # define XXH_SEC_ALIGN XXH_ACC_ALIGN 4069 #elif XXH_VECTOR == XXH_RVV 4070 # define XXH_SEC_ALIGN XXH_ACC_ALIGN 4071 #else 4072 # define XXH_SEC_ALIGN 8 4073 #endif 4074 4075 #if defined(__GNUC__) || defined(__clang__) 4076 # define XXH_ALIASING __attribute__((__may_alias__)) 4077 #else 4078 # define XXH_ALIASING /* nothing */ 4079 #endif 4080 4081 /* 4082 * UGLY HACK: 4083 * GCC usually generates the best code with -O3 for xxHash. 4084 * 4085 * However, when targeting AVX2, it is overzealous in its unrolling resulting 4086 * in code roughly 3/4 the speed of Clang. 4087 * 4088 * There are other issues, such as GCC splitting _mm256_loadu_si256 into 4089 * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which 4090 * only applies to Sandy and Ivy Bridge... which don't even support AVX2. 4091 * 4092 * That is why when compiling the AVX2 version, it is recommended to use either 4093 * -O2 -mavx2 -march=haswell 4094 * or 4095 * -O2 -mavx2 -mno-avx256-split-unaligned-load 4096 * for decent performance, or to use Clang instead. 4097 * 4098 * Fortunately, we can control the first one with a pragma that forces GCC into 4099 * -O2, but the other one we can't control without "failed to inline always 4100 * inline function due to target mismatch" warnings. 4101 */ 4102 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ 4103 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 4104 && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ 4105 # pragma GCC push_options 4106 # pragma GCC optimize("-O2") 4107 #endif 4108 4109 #if XXH_VECTOR == XXH_NEON 4110 4111 /* 4112 * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 4113 * optimizes out the entire hashLong loop because of the aliasing violation. 4114 * 4115 * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, 4116 * so the only option is to mark it as aliasing. 4117 */ 4118 typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; 4119 4120 /*! 4121 * @internal 4122 * @brief `vld1q_u64` but faster and alignment-safe. 4123 * 4124 * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only 4125 * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). 4126 * 4127 * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it 4128 * prohibits load-store optimizations. Therefore, a direct dereference is used. 4129 * 4130 * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe 4131 * unaligned load. 4132 */ 4133 #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) 4134 XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ 4135 { 4136 return *(xxh_aliasing_uint64x2_t const *)ptr; 4137 } 4138 #else 4139 XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) 4140 { 4141 return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); 4142 } 4143 #endif 4144 4145 /*! 4146 * @internal 4147 * @brief `vmlal_u32` on low and high halves of a vector. 4148 * 4149 * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with 4150 * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` 4151 * with `vmlal_u32`. 4152 */ 4153 #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 4154 XXH_FORCE_INLINE uint64x2_t 4155 XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) 4156 { 4157 /* Inline assembly is the only way */ 4158 __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); 4159 return acc; 4160 } 4161 XXH_FORCE_INLINE uint64x2_t 4162 XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) 4163 { 4164 /* This intrinsic works as expected */ 4165 return vmlal_high_u32(acc, lhs, rhs); 4166 } 4167 #else 4168 /* Portable intrinsic versions */ 4169 XXH_FORCE_INLINE uint64x2_t 4170 XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) 4171 { 4172 return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); 4173 } 4174 /*! @copydoc XXH_vmlal_low_u32 4175 * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ 4176 XXH_FORCE_INLINE uint64x2_t 4177 XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) 4178 { 4179 return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); 4180 } 4181 #endif 4182 4183 /*! 4184 * @ingroup tuning 4185 * @brief Controls the NEON to scalar ratio for XXH3 4186 * 4187 * This can be set to 2, 4, 6, or 8. 4188 * 4189 * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. 4190 * 4191 * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those 4192 * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU 4193 * bandwidth. 4194 * 4195 * This is even more noticeable on the more advanced cores like the Cortex-A76 which 4196 * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. 4197 * 4198 * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes 4199 * and 2 scalar lanes, which is chosen by default. 4200 * 4201 * This does not apply to Apple processors or 32-bit processors, which run better with 4202 * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes. 4203 * 4204 * This change benefits CPUs with large micro-op buffers without negatively affecting 4205 * most other CPUs: 4206 * 4207 * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | 4208 * |:----------------------|:--------------------|----------:|-----------:|------:| 4209 * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | 4210 * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | 4211 * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | 4212 * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | 4213 * 4214 * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. 4215 * 4216 * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning 4217 * it effectively becomes worse 4. 4218 * 4219 * @see XXH3_accumulate_512_neon() 4220 */ 4221 # ifndef XXH3_NEON_LANES 4222 # if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ 4223 && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 4224 # define XXH3_NEON_LANES 6 4225 # else 4226 # define XXH3_NEON_LANES XXH_ACC_NB 4227 # endif 4228 # endif 4229 #endif /* XXH_VECTOR == XXH_NEON */ 4230 4231 /* 4232 * VSX and Z Vector helpers. 4233 * 4234 * This is very messy, and any pull requests to clean this up are welcome. 4235 * 4236 * There are a lot of problems with supporting VSX and s390x, due to 4237 * inconsistent intrinsics, spotty coverage, and multiple endiannesses. 4238 */ 4239 #if XXH_VECTOR == XXH_VSX 4240 /* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, 4241 * and `pixel`. This is a problem for obvious reasons. 4242 * 4243 * These keywords are unnecessary; the spec literally says they are 4244 * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd 4245 * after including the header. 4246 * 4247 * We use pragma push_macro/pop_macro to keep the namespace clean. */ 4248 # pragma push_macro("bool") 4249 # pragma push_macro("vector") 4250 # pragma push_macro("pixel") 4251 /* silence potential macro redefined warnings */ 4252 # undef bool 4253 # undef vector 4254 # undef pixel 4255 4256 # if defined(__s390x__) 4257 # include <s390intrin.h> 4258 # else 4259 # include <altivec.h> 4260 # endif 4261 4262 /* Restore the original macro values, if applicable. */ 4263 # pragma pop_macro("pixel") 4264 # pragma pop_macro("vector") 4265 # pragma pop_macro("bool") 4266 4267 typedef __vector unsigned long long xxh_u64x2; 4268 typedef __vector unsigned char xxh_u8x16; 4269 typedef __vector unsigned xxh_u32x4; 4270 4271 /* 4272 * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue. 4273 */ 4274 typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING; 4275 4276 # ifndef XXH_VSX_BE 4277 # if defined(__BIG_ENDIAN__) \ 4278 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) 4279 # define XXH_VSX_BE 1 4280 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ 4281 # warning "-maltivec=be is not recommended. Please use native endianness." 4282 # define XXH_VSX_BE 1 4283 # else 4284 # define XXH_VSX_BE 0 4285 # endif 4286 # endif /* !defined(XXH_VSX_BE) */ 4287 4288 # if XXH_VSX_BE 4289 # if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) 4290 # define XXH_vec_revb vec_revb 4291 # else 4292 /*! 4293 * A polyfill for POWER9's vec_revb(). 4294 */ 4295 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) 4296 { 4297 xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 4298 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; 4299 return vec_perm(val, val, vByteSwap); 4300 } 4301 # endif 4302 # endif /* XXH_VSX_BE */ 4303 4304 /*! 4305 * Performs an unaligned vector load and byte swaps it on big endian. 4306 */ 4307 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) 4308 { 4309 xxh_u64x2 ret; 4310 XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); 4311 # if XXH_VSX_BE 4312 ret = XXH_vec_revb(ret); 4313 # endif 4314 return ret; 4315 } 4316 4317 /* 4318 * vec_mulo and vec_mule are very problematic intrinsics on PowerPC 4319 * 4320 * These intrinsics weren't added until GCC 8, despite existing for a while, 4321 * and they are endian dependent. Also, their meaning swap depending on version. 4322 * */ 4323 # if defined(__s390x__) 4324 /* s390x is always big endian, no issue on this platform */ 4325 # define XXH_vec_mulo vec_mulo 4326 # define XXH_vec_mule vec_mule 4327 # elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) 4328 /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ 4329 /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */ 4330 # define XXH_vec_mulo __builtin_altivec_vmulouw 4331 # define XXH_vec_mule __builtin_altivec_vmuleuw 4332 # else 4333 /* gcc needs inline assembly */ 4334 /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ 4335 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) 4336 { 4337 xxh_u64x2 result; 4338 __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); 4339 return result; 4340 } 4341 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) 4342 { 4343 xxh_u64x2 result; 4344 __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); 4345 return result; 4346 } 4347 # endif /* XXH_vec_mulo, XXH_vec_mule */ 4348 #endif /* XXH_VECTOR == XXH_VSX */ 4349 4350 #if XXH_VECTOR == XXH_SVE 4351 #define ACCRND(acc, offset) \ 4352 do { \ 4353 svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ 4354 svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ 4355 svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ 4356 svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ 4357 svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ 4358 svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ 4359 svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ 4360 acc = svadd_u64_x(mask, acc, mul); \ 4361 } while (0) 4362 #endif /* XXH_VECTOR == XXH_SVE */ 4363 4364 /* prefetch 4365 * can be disabled, by declaring XXH_NO_PREFETCH build macro */ 4366 #if defined(XXH_NO_PREFETCH) 4367 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ 4368 #else 4369 # if XXH_SIZE_OPT >= 1 4370 # define XXH_PREFETCH(ptr) (void)(ptr) 4371 # elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ 4372 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ 4373 # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) 4374 # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) 4375 # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) 4376 # else 4377 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ 4378 # endif 4379 #endif /* XXH_NO_PREFETCH */ 4380 4381 4382 /* ========================================== 4383 * XXH3 default settings 4384 * ========================================== */ 4385 4386 #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ 4387 4388 #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) 4389 # error "default keyset is not large enough" 4390 #endif 4391 4392 /*! 4393 * @internal 4394 * @def XXH3_kSecret 4395 * @brief Pseudorandom secret taken directly from FARSH. */ 4396 XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { 4397 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, 4398 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 4399 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, 4400 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, 4401 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 4402 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, 4403 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, 4404 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 4405 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, 4406 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, 4407 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 4408 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, 4409 }; 4410 4411 static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ 4412 static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ 4413 4414 #ifdef XXH_OLD_NAMES 4415 # define kSecret XXH3_kSecret 4416 #endif 4417 4418 #ifdef XXH_DOXYGEN 4419 /*! 4420 * @brief Calculates a 32-bit to 64-bit long multiply. 4421 * 4422 * Implemented as a macro. 4423 * 4424 * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't 4425 * need to (but it shouldn't need to anyways, it is about 7 instructions to do 4426 * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we 4427 * use that instead of the normal method. 4428 * 4429 * If you are compiling for platforms like Thumb-1 and don't have a better option, 4430 * you may also want to write your own long multiply routine here. 4431 * 4432 * @param x, y Numbers to be multiplied 4433 * @return 64-bit product of the low 32 bits of @p x and @p y. 4434 */ 4435 XXH_FORCE_INLINE xxh_u64 4436 XXH_mult32to64(xxh_u64 x, xxh_u64 y) 4437 { 4438 return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); 4439 } 4440 #elif defined(_MSC_VER) && defined(_M_IX86) 4441 # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) 4442 #else 4443 /* 4444 * Downcast + upcast is usually better than masking on older compilers like 4445 * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. 4446 * 4447 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands 4448 * and perform a full 64x64 multiply -- entirely redundant on 32-bit. 4449 */ 4450 # define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) 4451 #endif 4452 4453 /*! 4454 * @brief Calculates a 64->128-bit long multiply. 4455 * 4456 * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar 4457 * version. 4458 * 4459 * @param lhs , rhs The 64-bit integers to be multiplied 4460 * @return The 128-bit result represented in an @ref XXH128_hash_t. 4461 */ 4462 static XXH128_hash_t 4463 XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) 4464 { 4465 /* 4466 * GCC/Clang __uint128_t method. 4467 * 4468 * On most 64-bit targets, GCC and Clang define a __uint128_t type. 4469 * This is usually the best way as it usually uses a native long 64-bit 4470 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. 4471 * 4472 * Usually. 4473 * 4474 * Despite being a 32-bit platform, Clang (and emscripten) define this type 4475 * despite not having the arithmetic for it. This results in a laggy 4476 * compiler builtin call which calculates a full 128-bit multiply. 4477 * In that case it is best to use the portable one. 4478 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 4479 */ 4480 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ 4481 && defined(__SIZEOF_INT128__) \ 4482 || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) 4483 4484 __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; 4485 XXH128_hash_t r128; 4486 r128.low64 = (xxh_u64)(product); 4487 r128.high64 = (xxh_u64)(product >> 64); 4488 return r128; 4489 4490 /* 4491 * MSVC for x64's _umul128 method. 4492 * 4493 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); 4494 * 4495 * This compiles to single operand MUL on x64. 4496 */ 4497 #elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) 4498 4499 #ifndef _MSC_VER 4500 # pragma intrinsic(_umul128) 4501 #endif 4502 xxh_u64 product_high; 4503 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); 4504 XXH128_hash_t r128; 4505 r128.low64 = product_low; 4506 r128.high64 = product_high; 4507 return r128; 4508 4509 /* 4510 * MSVC for ARM64's __umulh method. 4511 * 4512 * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. 4513 */ 4514 #elif defined(_M_ARM64) || defined(_M_ARM64EC) 4515 4516 #ifndef _MSC_VER 4517 # pragma intrinsic(__umulh) 4518 #endif 4519 XXH128_hash_t r128; 4520 r128.low64 = lhs * rhs; 4521 r128.high64 = __umulh(lhs, rhs); 4522 return r128; 4523 4524 #else 4525 /* 4526 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. 4527 * 4528 * This is a fast and simple grade school multiply, which is shown below 4529 * with base 10 arithmetic instead of base 0x100000000. 4530 * 4531 * 9 3 // D2 lhs = 93 4532 * x 7 5 // D2 rhs = 75 4533 * ---------- 4534 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 4535 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 4536 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 4537 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 4538 * --------- 4539 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 4540 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 4541 * --------- 4542 * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 4543 * 4544 * The reasons for adding the products like this are: 4545 * 1. It avoids manual carry tracking. Just like how 4546 * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. 4547 * This avoids a lot of complexity. 4548 * 4549 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL 4550 * instruction available in ARM's Digital Signal Processing extension 4551 * in 32-bit ARMv6 and later, which is shown below: 4552 * 4553 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) 4554 * { 4555 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; 4556 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); 4557 * *RdHi = (xxh_u32)(product >> 32); 4558 * } 4559 * 4560 * This instruction was designed for efficient long multiplication, and 4561 * allows this to be calculated in only 4 instructions at speeds 4562 * comparable to some 64-bit ALUs. 4563 * 4564 * 3. It isn't terrible on other platforms. Usually this will be a couple 4565 * of 32-bit ADD/ADCs. 4566 */ 4567 4568 /* First calculate all of the cross products. */ 4569 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); 4570 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); 4571 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); 4572 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); 4573 4574 /* Now add the products together. These will never overflow. */ 4575 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; 4576 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; 4577 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); 4578 4579 XXH128_hash_t r128; 4580 r128.low64 = lower; 4581 r128.high64 = upper; 4582 return r128; 4583 #endif 4584 } 4585 4586 /*! 4587 * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. 4588 * 4589 * The reason for the separate function is to prevent passing too many structs 4590 * around by value. This will hopefully inline the multiply, but we don't force it. 4591 * 4592 * @param lhs , rhs The 64-bit integers to multiply 4593 * @return The low 64 bits of the product XOR'd by the high 64 bits. 4594 * @see XXH_mult64to128() 4595 */ 4596 static xxh_u64 4597 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) 4598 { 4599 XXH128_hash_t product = XXH_mult64to128(lhs, rhs); 4600 return product.low64 ^ product.high64; 4601 } 4602 4603 /*! Seems to produce slightly better code on GCC for some reason. */ 4604 XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) 4605 { 4606 XXH_ASSERT(0 <= shift && shift < 64); 4607 return v64 ^ (v64 >> shift); 4608 } 4609 4610 /* 4611 * This is a fast avalanche stage, 4612 * suitable when input bits are already partially mixed 4613 */ 4614 static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) 4615 { 4616 h64 = XXH_xorshift64(h64, 37); 4617 h64 *= PRIME_MX1; 4618 h64 = XXH_xorshift64(h64, 32); 4619 return h64; 4620 } 4621 4622 /* 4623 * This is a stronger avalanche, 4624 * inspired by Pelle Evensen's rrmxmx 4625 * preferable when input has not been previously mixed 4626 */ 4627 static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) 4628 { 4629 /* this mix is inspired by Pelle Evensen's rrmxmx */ 4630 h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); 4631 h64 *= PRIME_MX2; 4632 h64 ^= (h64 >> 35) + len ; 4633 h64 *= PRIME_MX2; 4634 return XXH_xorshift64(h64, 28); 4635 } 4636 4637 4638 /* ========================================== 4639 * Short keys 4640 * ========================================== 4641 * One of the shortcomings of XXH32 and XXH64 was that their performance was 4642 * sub-optimal on short lengths. It used an iterative algorithm which strongly 4643 * favored lengths that were a multiple of 4 or 8. 4644 * 4645 * Instead of iterating over individual inputs, we use a set of single shot 4646 * functions which piece together a range of lengths and operate in constant time. 4647 * 4648 * Additionally, the number of multiplies has been significantly reduced. This 4649 * reduces latency, especially when emulating 64-bit multiplies on 32-bit. 4650 * 4651 * Depending on the platform, this may or may not be faster than XXH32, but it 4652 * is almost guaranteed to be faster than XXH64. 4653 */ 4654 4655 /* 4656 * At very short lengths, there isn't enough input to fully hide secrets, or use 4657 * the entire secret. 4658 * 4659 * There is also only a limited amount of mixing we can do before significantly 4660 * impacting performance. 4661 * 4662 * Therefore, we use different sections of the secret and always mix two secret 4663 * samples with an XOR. This should have no effect on performance on the 4664 * seedless or withSeed variants because everything _should_ be constant folded 4665 * by modern compilers. 4666 * 4667 * The XOR mixing hides individual parts of the secret and increases entropy. 4668 * 4669 * This adds an extra layer of strength for custom secrets. 4670 */ 4671 XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t 4672 XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 4673 { 4674 XXH_ASSERT(input != NULL); 4675 XXH_ASSERT(1 <= len && len <= 3); 4676 XXH_ASSERT(secret != NULL); 4677 /* 4678 * len = 1: combined = { input[0], 0x01, input[0], input[0] } 4679 * len = 2: combined = { input[1], 0x02, input[0], input[1] } 4680 * len = 3: combined = { input[2], 0x03, input[0], input[1] } 4681 */ 4682 { xxh_u8 const c1 = input[0]; 4683 xxh_u8 const c2 = input[len >> 1]; 4684 xxh_u8 const c3 = input[len - 1]; 4685 xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) 4686 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); 4687 xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; 4688 xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; 4689 return XXH64_avalanche(keyed); 4690 } 4691 } 4692 4693 XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t 4694 XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 4695 { 4696 XXH_ASSERT(input != NULL); 4697 XXH_ASSERT(secret != NULL); 4698 XXH_ASSERT(4 <= len && len <= 8); 4699 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; 4700 { xxh_u32 const input1 = XXH_readLE32(input); 4701 xxh_u32 const input2 = XXH_readLE32(input + len - 4); 4702 xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; 4703 xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); 4704 xxh_u64 const keyed = input64 ^ bitflip; 4705 return XXH3_rrmxmx(keyed, len); 4706 } 4707 } 4708 4709 XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t 4710 XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 4711 { 4712 XXH_ASSERT(input != NULL); 4713 XXH_ASSERT(secret != NULL); 4714 XXH_ASSERT(9 <= len && len <= 16); 4715 { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; 4716 xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; 4717 xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; 4718 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; 4719 xxh_u64 const acc = len 4720 + XXH_swap64(input_lo) + input_hi 4721 + XXH3_mul128_fold64(input_lo, input_hi); 4722 return XXH3_avalanche(acc); 4723 } 4724 } 4725 4726 XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t 4727 XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 4728 { 4729 XXH_ASSERT(len <= 16); 4730 { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); 4731 if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); 4732 if (len) return XXH3_len_1to3_64b(input, len, secret, seed); 4733 return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); 4734 } 4735 } 4736 4737 /* 4738 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to 4739 * multiplication by zero, affecting hashes of lengths 17 to 240. 4740 * 4741 * However, they are very unlikely. 4742 * 4743 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all 4744 * unseeded non-cryptographic hashes, it does not attempt to defend itself 4745 * against specially crafted inputs, only random inputs. 4746 * 4747 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes 4748 * cancelling out the secret is taken an arbitrary number of times (addressed 4749 * in XXH3_accumulate_512), this collision is very unlikely with random inputs 4750 * and/or proper seeding: 4751 * 4752 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a 4753 * function that is only called up to 16 times per hash with up to 240 bytes of 4754 * input. 4755 * 4756 * This is not too bad for a non-cryptographic hash function, especially with 4757 * only 64 bit outputs. 4758 * 4759 * The 128-bit variant (which trades some speed for strength) is NOT affected 4760 * by this, although it is always a good idea to use a proper seed if you care 4761 * about strength. 4762 */ 4763 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, 4764 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) 4765 { 4766 #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 4767 && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ 4768 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ 4769 /* 4770 * UGLY HACK: 4771 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in 4772 * slower code. 4773 * 4774 * By forcing seed64 into a register, we disrupt the cost model and 4775 * cause it to scalarize. See `XXH32_round()` 4776 * 4777 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, 4778 * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on 4779 * GCC 9.2, despite both emitting scalar code. 4780 * 4781 * GCC generates much better scalar code than Clang for the rest of XXH3, 4782 * which is why finding a more optimal codepath is an interest. 4783 */ 4784 XXH_COMPILER_GUARD(seed64); 4785 #endif 4786 { xxh_u64 const input_lo = XXH_readLE64(input); 4787 xxh_u64 const input_hi = XXH_readLE64(input+8); 4788 return XXH3_mul128_fold64( 4789 input_lo ^ (XXH_readLE64(secret) + seed64), 4790 input_hi ^ (XXH_readLE64(secret+8) - seed64) 4791 ); 4792 } 4793 } 4794 4795 /* For mid range keys, XXH3 uses a Mum-hash variant. */ 4796 XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t 4797 XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, 4798 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 4799 XXH64_hash_t seed) 4800 { 4801 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 4802 XXH_ASSERT(16 < len && len <= 128); 4803 4804 { xxh_u64 acc = len * XXH_PRIME64_1; 4805 #if XXH_SIZE_OPT >= 1 4806 /* Smaller and cleaner, but slightly slower. */ 4807 unsigned int i = (unsigned int)(len - 1) / 32; 4808 do { 4809 acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); 4810 acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); 4811 } while (i-- != 0); 4812 #else 4813 if (len > 32) { 4814 if (len > 64) { 4815 if (len > 96) { 4816 acc += XXH3_mix16B(input+48, secret+96, seed); 4817 acc += XXH3_mix16B(input+len-64, secret+112, seed); 4818 } 4819 acc += XXH3_mix16B(input+32, secret+64, seed); 4820 acc += XXH3_mix16B(input+len-48, secret+80, seed); 4821 } 4822 acc += XXH3_mix16B(input+16, secret+32, seed); 4823 acc += XXH3_mix16B(input+len-32, secret+48, seed); 4824 } 4825 acc += XXH3_mix16B(input+0, secret+0, seed); 4826 acc += XXH3_mix16B(input+len-16, secret+16, seed); 4827 #endif 4828 return XXH3_avalanche(acc); 4829 } 4830 } 4831 4832 XXH_NO_INLINE XXH_PUREF XXH64_hash_t 4833 XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, 4834 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 4835 XXH64_hash_t seed) 4836 { 4837 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 4838 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 4839 4840 #define XXH3_MIDSIZE_STARTOFFSET 3 4841 #define XXH3_MIDSIZE_LASTOFFSET 17 4842 4843 { xxh_u64 acc = len * XXH_PRIME64_1; 4844 xxh_u64 acc_end; 4845 unsigned int const nbRounds = (unsigned int)len / 16; 4846 unsigned int i; 4847 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 4848 for (i=0; i<8; i++) { 4849 acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); 4850 } 4851 /* last bytes */ 4852 acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); 4853 XXH_ASSERT(nbRounds >= 8); 4854 acc = XXH3_avalanche(acc); 4855 #if defined(__clang__) /* Clang */ \ 4856 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ 4857 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ 4858 /* 4859 * UGLY HACK: 4860 * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. 4861 * In everywhere else, it uses scalar code. 4862 * 4863 * For 64->128-bit multiplies, even if the NEON was 100% optimal, it 4864 * would still be slower than UMAAL (see XXH_mult64to128). 4865 * 4866 * Unfortunately, Clang doesn't handle the long multiplies properly and 4867 * converts them to the nonexistent "vmulq_u64" intrinsic, which is then 4868 * scalarized into an ugly mess of VMOV.32 instructions. 4869 * 4870 * This mess is difficult to avoid without turning autovectorization 4871 * off completely, but they are usually relatively minor and/or not 4872 * worth it to fix. 4873 * 4874 * This loop is the easiest to fix, as unlike XXH32, this pragma 4875 * _actually works_ because it is a loop vectorization instead of an 4876 * SLP vectorization. 4877 */ 4878 #pragma clang loop vectorize(disable) 4879 #endif 4880 for (i=8 ; i < nbRounds; i++) { 4881 /* 4882 * Prevents clang for unrolling the acc loop and interleaving with this one. 4883 */ 4884 XXH_COMPILER_GUARD(acc); 4885 acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); 4886 } 4887 return XXH3_avalanche(acc + acc_end); 4888 } 4889 } 4890 4891 4892 /* ======= Long Keys ======= */ 4893 4894 #define XXH_STRIPE_LEN 64 4895 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ 4896 #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) 4897 4898 #ifdef XXH_OLD_NAMES 4899 # define STRIPE_LEN XXH_STRIPE_LEN 4900 # define ACC_NB XXH_ACC_NB 4901 #endif 4902 4903 #ifndef XXH_PREFETCH_DIST 4904 # ifdef __clang__ 4905 # define XXH_PREFETCH_DIST 320 4906 # else 4907 # if (XXH_VECTOR == XXH_AVX512) 4908 # define XXH_PREFETCH_DIST 512 4909 # else 4910 # define XXH_PREFETCH_DIST 384 4911 # endif 4912 # endif /* __clang__ */ 4913 #endif /* XXH_PREFETCH_DIST */ 4914 4915 /* 4916 * These macros are to generate an XXH3_accumulate() function. 4917 * The two arguments select the name suffix and target attribute. 4918 * 4919 * The name of this symbol is XXH3_accumulate_<name>() and it calls 4920 * XXH3_accumulate_512_<name>(). 4921 * 4922 * It may be useful to hand implement this function if the compiler fails to 4923 * optimize the inline function. 4924 */ 4925 #define XXH3_ACCUMULATE_TEMPLATE(name) \ 4926 void \ 4927 XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ 4928 const xxh_u8* XXH_RESTRICT input, \ 4929 const xxh_u8* XXH_RESTRICT secret, \ 4930 size_t nbStripes) \ 4931 { \ 4932 size_t n; \ 4933 for (n = 0; n < nbStripes; n++ ) { \ 4934 const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ 4935 XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ 4936 XXH3_accumulate_512_##name( \ 4937 acc, \ 4938 in, \ 4939 secret + n*XXH_SECRET_CONSUME_RATE); \ 4940 } \ 4941 } 4942 4943 4944 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) 4945 { 4946 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); 4947 XXH_memcpy(dst, &v64, sizeof(v64)); 4948 } 4949 4950 /* Several intrinsic functions below are supposed to accept __int64 as argument, 4951 * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . 4952 * However, several environments do not define __int64 type, 4953 * requiring a workaround. 4954 */ 4955 #if !defined (__VMS) \ 4956 && (defined (__cplusplus) \ 4957 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) 4958 typedef int64_t xxh_i64; 4959 #else 4960 /* the following type must have a width of 64-bit */ 4961 typedef long long xxh_i64; 4962 #endif 4963 4964 4965 /* 4966 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. 4967 * 4968 * It is a hardened version of UMAC, based off of FARSH's implementation. 4969 * 4970 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD 4971 * implementations, and it is ridiculously fast. 4972 * 4973 * We harden it by mixing the original input to the accumulators as well as the product. 4974 * 4975 * This means that in the (relatively likely) case of a multiply by zero, the 4976 * original input is preserved. 4977 * 4978 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve 4979 * cross-pollination, as otherwise the upper and lower halves would be 4980 * essentially independent. 4981 * 4982 * This doesn't matter on 64-bit hashes since they all get merged together in 4983 * the end, so we skip the extra step. 4984 * 4985 * Both XXH3_64bits and XXH3_128bits use this subroutine. 4986 */ 4987 4988 #if (XXH_VECTOR == XXH_AVX512) \ 4989 || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) 4990 4991 #ifndef XXH_TARGET_AVX512 4992 # define XXH_TARGET_AVX512 /* disable attribute target */ 4993 #endif 4994 4995 XXH_FORCE_INLINE XXH_TARGET_AVX512 void 4996 XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, 4997 const void* XXH_RESTRICT input, 4998 const void* XXH_RESTRICT secret) 4999 { 5000 __m512i* const xacc = (__m512i *) acc; 5001 XXH_ASSERT((((size_t)acc) & 63) == 0); 5002 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); 5003 5004 { 5005 /* data_vec = input[0]; */ 5006 __m512i const data_vec = _mm512_loadu_si512 (input); 5007 /* key_vec = secret[0]; */ 5008 __m512i const key_vec = _mm512_loadu_si512 (secret); 5009 /* data_key = data_vec ^ key_vec; */ 5010 __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); 5011 /* data_key_lo = data_key >> 32; */ 5012 __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); 5013 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 5014 __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); 5015 /* xacc[0] += swap(data_vec); */ 5016 __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); 5017 __m512i const sum = _mm512_add_epi64(*xacc, data_swap); 5018 /* xacc[0] += product; */ 5019 *xacc = _mm512_add_epi64(product, sum); 5020 } 5021 } 5022 XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) 5023 5024 /* 5025 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. 5026 * 5027 * Multiplication isn't perfect, as explained by Google in HighwayHash: 5028 * 5029 * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to 5030 * // varying degrees. In descending order of goodness, bytes 5031 * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. 5032 * // As expected, the upper and lower bytes are much worse. 5033 * 5034 * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 5035 * 5036 * Since our algorithm uses a pseudorandom secret to add some variance into the 5037 * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. 5038 * 5039 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid 5040 * extraction. 5041 * 5042 * Both XXH3_64bits and XXH3_128bits use this subroutine. 5043 */ 5044 5045 XXH_FORCE_INLINE XXH_TARGET_AVX512 void 5046 XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 5047 { 5048 XXH_ASSERT((((size_t)acc) & 63) == 0); 5049 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); 5050 { __m512i* const xacc = (__m512i*) acc; 5051 const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); 5052 5053 /* xacc[0] ^= (xacc[0] >> 47) */ 5054 __m512i const acc_vec = *xacc; 5055 __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); 5056 /* xacc[0] ^= secret; */ 5057 __m512i const key_vec = _mm512_loadu_si512 (secret); 5058 __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); 5059 5060 /* xacc[0] *= XXH_PRIME32_1; */ 5061 __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); 5062 __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); 5063 __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); 5064 *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); 5065 } 5066 } 5067 5068 XXH_FORCE_INLINE XXH_TARGET_AVX512 void 5069 XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) 5070 { 5071 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); 5072 XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); 5073 XXH_ASSERT(((size_t)customSecret & 63) == 0); 5074 (void)(&XXH_writeLE64); 5075 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); 5076 __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); 5077 __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); 5078 5079 const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); 5080 __m512i* const dest = ( __m512i*) customSecret; 5081 int i; 5082 XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ 5083 XXH_ASSERT(((size_t)dest & 63) == 0); 5084 for (i=0; i < nbRounds; ++i) { 5085 dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); 5086 } } 5087 } 5088 5089 #endif 5090 5091 #if (XXH_VECTOR == XXH_AVX2) \ 5092 || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) 5093 5094 #ifndef XXH_TARGET_AVX2 5095 # define XXH_TARGET_AVX2 /* disable attribute target */ 5096 #endif 5097 5098 XXH_FORCE_INLINE XXH_TARGET_AVX2 void 5099 XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, 5100 const void* XXH_RESTRICT input, 5101 const void* XXH_RESTRICT secret) 5102 { 5103 XXH_ASSERT((((size_t)acc) & 31) == 0); 5104 { __m256i* const xacc = (__m256i *) acc; 5105 /* Unaligned. This is mainly for pointer arithmetic, and because 5106 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 5107 const __m256i* const xinput = (const __m256i *) input; 5108 /* Unaligned. This is mainly for pointer arithmetic, and because 5109 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 5110 const __m256i* const xsecret = (const __m256i *) secret; 5111 5112 size_t i; 5113 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { 5114 /* data_vec = xinput[i]; */ 5115 __m256i const data_vec = _mm256_loadu_si256 (xinput+i); 5116 /* key_vec = xsecret[i]; */ 5117 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); 5118 /* data_key = data_vec ^ key_vec; */ 5119 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); 5120 /* data_key_lo = data_key >> 32; */ 5121 __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); 5122 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 5123 __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); 5124 /* xacc[i] += swap(data_vec); */ 5125 __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); 5126 __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); 5127 /* xacc[i] += product; */ 5128 xacc[i] = _mm256_add_epi64(product, sum); 5129 } } 5130 } 5131 XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) 5132 5133 XXH_FORCE_INLINE XXH_TARGET_AVX2 void 5134 XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 5135 { 5136 XXH_ASSERT((((size_t)acc) & 31) == 0); 5137 { __m256i* const xacc = (__m256i*) acc; 5138 /* Unaligned. This is mainly for pointer arithmetic, and because 5139 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ 5140 const __m256i* const xsecret = (const __m256i *) secret; 5141 const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); 5142 5143 size_t i; 5144 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { 5145 /* xacc[i] ^= (xacc[i] >> 47) */ 5146 __m256i const acc_vec = xacc[i]; 5147 __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); 5148 __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); 5149 /* xacc[i] ^= xsecret; */ 5150 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); 5151 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); 5152 5153 /* xacc[i] *= XXH_PRIME32_1; */ 5154 __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); 5155 __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); 5156 __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); 5157 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); 5158 } 5159 } 5160 } 5161 5162 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) 5163 { 5164 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); 5165 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); 5166 XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); 5167 (void)(&XXH_writeLE64); 5168 XXH_PREFETCH(customSecret); 5169 { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); 5170 5171 const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); 5172 __m256i* dest = ( __m256i*) customSecret; 5173 5174 # if defined(__GNUC__) || defined(__clang__) 5175 /* 5176 * On GCC & Clang, marking 'dest' as modified will cause the compiler: 5177 * - do not extract the secret from sse registers in the internal loop 5178 * - use less common registers, and avoid pushing these reg into stack 5179 */ 5180 XXH_COMPILER_GUARD(dest); 5181 # endif 5182 XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ 5183 XXH_ASSERT(((size_t)dest & 31) == 0); 5184 5185 /* GCC -O2 need unroll loop manually */ 5186 dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); 5187 dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed); 5188 dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); 5189 dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); 5190 dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); 5191 dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed); 5192 } 5193 } 5194 5195 #endif 5196 5197 /* x86dispatch always generates SSE2 */ 5198 #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) 5199 5200 #ifndef XXH_TARGET_SSE2 5201 # define XXH_TARGET_SSE2 /* disable attribute target */ 5202 #endif 5203 5204 XXH_FORCE_INLINE XXH_TARGET_SSE2 void 5205 XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, 5206 const void* XXH_RESTRICT input, 5207 const void* XXH_RESTRICT secret) 5208 { 5209 /* SSE2 is just a half-scale version of the AVX2 version. */ 5210 XXH_ASSERT((((size_t)acc) & 15) == 0); 5211 { __m128i* const xacc = (__m128i *) acc; 5212 /* Unaligned. This is mainly for pointer arithmetic, and because 5213 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ 5214 const __m128i* const xinput = (const __m128i *) input; 5215 /* Unaligned. This is mainly for pointer arithmetic, and because 5216 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ 5217 const __m128i* const xsecret = (const __m128i *) secret; 5218 5219 size_t i; 5220 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { 5221 /* data_vec = xinput[i]; */ 5222 __m128i const data_vec = _mm_loadu_si128 (xinput+i); 5223 /* key_vec = xsecret[i]; */ 5224 __m128i const key_vec = _mm_loadu_si128 (xsecret+i); 5225 /* data_key = data_vec ^ key_vec; */ 5226 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); 5227 /* data_key_lo = data_key >> 32; */ 5228 __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 5229 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 5230 __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); 5231 /* xacc[i] += swap(data_vec); */ 5232 __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); 5233 __m128i const sum = _mm_add_epi64(xacc[i], data_swap); 5234 /* xacc[i] += product; */ 5235 xacc[i] = _mm_add_epi64(product, sum); 5236 } } 5237 } 5238 XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) 5239 5240 XXH_FORCE_INLINE XXH_TARGET_SSE2 void 5241 XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 5242 { 5243 XXH_ASSERT((((size_t)acc) & 15) == 0); 5244 { __m128i* const xacc = (__m128i*) acc; 5245 /* Unaligned. This is mainly for pointer arithmetic, and because 5246 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ 5247 const __m128i* const xsecret = (const __m128i *) secret; 5248 const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); 5249 5250 size_t i; 5251 for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { 5252 /* xacc[i] ^= (xacc[i] >> 47) */ 5253 __m128i const acc_vec = xacc[i]; 5254 __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); 5255 __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); 5256 /* xacc[i] ^= xsecret[i]; */ 5257 __m128i const key_vec = _mm_loadu_si128 (xsecret+i); 5258 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); 5259 5260 /* xacc[i] *= XXH_PRIME32_1; */ 5261 __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); 5262 __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); 5263 __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); 5264 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); 5265 } 5266 } 5267 } 5268 5269 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) 5270 { 5271 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); 5272 (void)(&XXH_writeLE64); 5273 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); 5274 5275 # if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER <= 1900 5276 /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 5277 * and some specific variants of 2015 may also lack it */ 5278 /* Cast to unsigned 64-bit first to avoid signed arithmetic issues */ 5279 xxh_u64 const seed64_unsigned = (xxh_u64)seed64; 5280 xxh_u64 const neg_seed64 = (xxh_u64)(0ULL - seed64_unsigned); 5281 __m128i const seed = _mm_set_epi32( 5282 (int)(neg_seed64 >> 32), /* high 32 bits of negated seed */ 5283 (int)(neg_seed64), /* low 32 bits of negated seed */ 5284 (int)(seed64_unsigned >> 32), /* high 32 bits of original seed */ 5285 (int)(seed64_unsigned) /* low 32 bits of original seed */ 5286 ); 5287 # else 5288 __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); 5289 # endif 5290 int i; 5291 5292 const void* const src16 = XXH3_kSecret; 5293 __m128i* dst16 = (__m128i*) customSecret; 5294 # if defined(__GNUC__) || defined(__clang__) 5295 /* 5296 * On GCC & Clang, marking 'dest' as modified will cause the compiler: 5297 * - do not extract the secret from sse registers in the internal loop 5298 * - use less common registers, and avoid pushing these reg into stack 5299 */ 5300 XXH_COMPILER_GUARD(dst16); 5301 # endif 5302 XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ 5303 XXH_ASSERT(((size_t)dst16 & 15) == 0); 5304 5305 for (i=0; i < nbRounds; ++i) { 5306 dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); 5307 } } 5308 } 5309 5310 #endif 5311 5312 #if (XXH_VECTOR == XXH_NEON) 5313 5314 /* forward declarations for the scalar routines */ 5315 XXH_FORCE_INLINE void 5316 XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, 5317 void const* XXH_RESTRICT secret, size_t lane); 5318 5319 XXH_FORCE_INLINE void 5320 XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, 5321 void const* XXH_RESTRICT secret, size_t lane); 5322 5323 /*! 5324 * @internal 5325 * @brief The bulk processing loop for NEON and WASM SIMD128. 5326 * 5327 * The NEON code path is actually partially scalar when running on AArch64. This 5328 * is to optimize the pipelining and can have up to 15% speedup depending on the 5329 * CPU, and it also mitigates some GCC codegen issues. 5330 * 5331 * @see XXH3_NEON_LANES for configuring this and details about this optimization. 5332 * 5333 * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit 5334 * integers instead of the other platforms which mask full 64-bit vectors, 5335 * so the setup is more complicated than just shifting right. 5336 * 5337 * Additionally, there is an optimization for 4 lanes at once noted below. 5338 * 5339 * Since, as stated, the most optimal amount of lanes for Cortexes is 6, 5340 * there needs to be *three* versions of the accumulate operation used 5341 * for the remaining 2 lanes. 5342 * 5343 * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap 5344 * nearly perfectly. 5345 */ 5346 5347 XXH_FORCE_INLINE void 5348 XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, 5349 const void* XXH_RESTRICT input, 5350 const void* XXH_RESTRICT secret) 5351 { 5352 XXH_ASSERT((((size_t)acc) & 15) == 0); 5353 XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); 5354 { /* GCC for darwin arm64 does not like aliasing here */ 5355 xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; 5356 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ 5357 uint8_t const* xinput = (const uint8_t *) input; 5358 uint8_t const* xsecret = (const uint8_t *) secret; 5359 5360 size_t i; 5361 #ifdef __wasm_simd128__ 5362 /* 5363 * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret 5364 * is constant propagated, which results in it converting it to this 5365 * inside the loop: 5366 * 5367 * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) 5368 * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) 5369 * ... 5370 * 5371 * This requires a full 32-bit address immediate (and therefore a 6 byte 5372 * instruction) as well as an add for each offset. 5373 * 5374 * Putting an asm guard prevents it from folding (at the cost of losing 5375 * the alignment hint), and uses the free offset in `v128.load` instead 5376 * of adding secret_offset each time which overall reduces code size by 5377 * about a kilobyte and improves performance. 5378 */ 5379 XXH_COMPILER_GUARD(xsecret); 5380 #endif 5381 /* Scalar lanes use the normal scalarRound routine */ 5382 for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { 5383 XXH3_scalarRound(acc, input, secret, i); 5384 } 5385 i = 0; 5386 /* 4 NEON lanes at a time. */ 5387 for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { 5388 /* data_vec = xinput[i]; */ 5389 uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); 5390 uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); 5391 /* key_vec = xsecret[i]; */ 5392 uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); 5393 uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); 5394 /* data_swap = swap(data_vec) */ 5395 uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); 5396 uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); 5397 /* data_key = data_vec ^ key_vec; */ 5398 uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); 5399 uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); 5400 5401 /* 5402 * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a 5403 * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to 5404 * get one vector with the low 32 bits of each lane, and one vector 5405 * with the high 32 bits of each lane. 5406 * 5407 * The intrinsic returns a double vector because the original ARMv7-a 5408 * instruction modified both arguments in place. AArch64 and SIMD128 emit 5409 * two instructions from this intrinsic. 5410 * 5411 * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] 5412 * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] 5413 */ 5414 uint32x4x2_t unzipped = vuzpq_u32( 5415 vreinterpretq_u32_u64(data_key_1), 5416 vreinterpretq_u32_u64(data_key_2) 5417 ); 5418 /* data_key_lo = data_key & 0xFFFFFFFF */ 5419 uint32x4_t data_key_lo = unzipped.val[0]; 5420 /* data_key_hi = data_key >> 32 */ 5421 uint32x4_t data_key_hi = unzipped.val[1]; 5422 /* 5423 * Then, we can split the vectors horizontally and multiply which, as for most 5424 * widening intrinsics, have a variant that works on both high half vectors 5425 * for free on AArch64. A similar instruction is available on SIMD128. 5426 * 5427 * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi 5428 */ 5429 uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); 5430 uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); 5431 /* 5432 * Clang reorders 5433 * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s 5434 * c += a; // add acc.2d, acc.2d, swap.2d 5435 * to 5436 * c += a; // add acc.2d, acc.2d, swap.2d 5437 * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s 5438 * 5439 * While it would make sense in theory since the addition is faster, 5440 * for reasons likely related to umlal being limited to certain NEON 5441 * pipelines, this is worse. A compiler guard fixes this. 5442 */ 5443 XXH_COMPILER_GUARD_CLANG_NEON(sum_1); 5444 XXH_COMPILER_GUARD_CLANG_NEON(sum_2); 5445 /* xacc[i] = acc_vec + sum; */ 5446 xacc[i] = vaddq_u64(xacc[i], sum_1); 5447 xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); 5448 } 5449 /* Operate on the remaining NEON lanes 2 at a time. */ 5450 for (; i < XXH3_NEON_LANES / 2; i++) { 5451 /* data_vec = xinput[i]; */ 5452 uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); 5453 /* key_vec = xsecret[i]; */ 5454 uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); 5455 /* acc_vec_2 = swap(data_vec) */ 5456 uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); 5457 /* data_key = data_vec ^ key_vec; */ 5458 uint64x2_t data_key = veorq_u64(data_vec, key_vec); 5459 /* For two lanes, just use VMOVN and VSHRN. */ 5460 /* data_key_lo = data_key & 0xFFFFFFFF; */ 5461 uint32x2_t data_key_lo = vmovn_u64(data_key); 5462 /* data_key_hi = data_key >> 32; */ 5463 uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); 5464 /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ 5465 uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); 5466 /* Same Clang workaround as before */ 5467 XXH_COMPILER_GUARD_CLANG_NEON(sum); 5468 /* xacc[i] = acc_vec + sum; */ 5469 xacc[i] = vaddq_u64 (xacc[i], sum); 5470 } 5471 } 5472 } 5473 XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) 5474 5475 XXH_FORCE_INLINE void 5476 XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 5477 { 5478 XXH_ASSERT((((size_t)acc) & 15) == 0); 5479 5480 { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; 5481 uint8_t const* xsecret = (uint8_t const*) secret; 5482 5483 size_t i; 5484 /* WASM uses operator overloads and doesn't need these. */ 5485 #ifndef __wasm_simd128__ 5486 /* { prime32_1, prime32_1 } */ 5487 uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); 5488 /* { 0, prime32_1, 0, prime32_1 } */ 5489 uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); 5490 #endif 5491 5492 /* AArch64 uses both scalar and neon at the same time */ 5493 for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { 5494 XXH3_scalarScrambleRound(acc, secret, i); 5495 } 5496 for (i=0; i < XXH3_NEON_LANES / 2; i++) { 5497 /* xacc[i] ^= (xacc[i] >> 47); */ 5498 uint64x2_t acc_vec = xacc[i]; 5499 uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); 5500 uint64x2_t data_vec = veorq_u64(acc_vec, shifted); 5501 5502 /* xacc[i] ^= xsecret[i]; */ 5503 uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); 5504 uint64x2_t data_key = veorq_u64(data_vec, key_vec); 5505 /* xacc[i] *= XXH_PRIME32_1 */ 5506 #ifdef __wasm_simd128__ 5507 /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ 5508 xacc[i] = data_key * XXH_PRIME32_1; 5509 #else 5510 /* 5511 * Expanded version with portable NEON intrinsics 5512 * 5513 * lo(x) * lo(y) + (hi(x) * lo(y) << 32) 5514 * 5515 * prod_hi = hi(data_key) * lo(prime) << 32 5516 * 5517 * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector 5518 * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits 5519 * and avoid the shift. 5520 */ 5521 uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); 5522 /* Extract low bits for vmlal_u32 */ 5523 uint32x2_t data_key_lo = vmovn_u64(data_key); 5524 /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ 5525 xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); 5526 #endif 5527 } 5528 } 5529 } 5530 #endif 5531 5532 #if (XXH_VECTOR == XXH_VSX) 5533 5534 XXH_FORCE_INLINE void 5535 XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, 5536 const void* XXH_RESTRICT input, 5537 const void* XXH_RESTRICT secret) 5538 { 5539 /* presumed aligned */ 5540 xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; 5541 xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ 5542 xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ 5543 xxh_u64x2 const v32 = { 32, 32 }; 5544 size_t i; 5545 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { 5546 /* data_vec = xinput[i]; */ 5547 xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); 5548 /* key_vec = xsecret[i]; */ 5549 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); 5550 xxh_u64x2 const data_key = data_vec ^ key_vec; 5551 /* shuffled = (data_key << 32) | (data_key >> 32); */ 5552 xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); 5553 /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ 5554 xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); 5555 /* acc_vec = xacc[i]; */ 5556 xxh_u64x2 acc_vec = xacc[i]; 5557 acc_vec += product; 5558 5559 /* swap high and low halves */ 5560 #ifdef __s390x__ 5561 acc_vec += vec_permi(data_vec, data_vec, 2); 5562 #else 5563 acc_vec += vec_xxpermdi(data_vec, data_vec, 2); 5564 #endif 5565 xacc[i] = acc_vec; 5566 } 5567 } 5568 XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) 5569 5570 XXH_FORCE_INLINE void 5571 XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 5572 { 5573 XXH_ASSERT((((size_t)acc) & 15) == 0); 5574 5575 { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; 5576 const xxh_u8* const xsecret = (const xxh_u8*) secret; 5577 /* constants */ 5578 xxh_u64x2 const v32 = { 32, 32 }; 5579 xxh_u64x2 const v47 = { 47, 47 }; 5580 xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; 5581 size_t i; 5582 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { 5583 /* xacc[i] ^= (xacc[i] >> 47); */ 5584 xxh_u64x2 const acc_vec = xacc[i]; 5585 xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); 5586 5587 /* xacc[i] ^= xsecret[i]; */ 5588 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); 5589 xxh_u64x2 const data_key = data_vec ^ key_vec; 5590 5591 /* xacc[i] *= XXH_PRIME32_1 */ 5592 /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ 5593 xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); 5594 /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ 5595 xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); 5596 xacc[i] = prod_odd + (prod_even << v32); 5597 } } 5598 } 5599 5600 #endif 5601 5602 #if (XXH_VECTOR == XXH_SVE) 5603 5604 XXH_FORCE_INLINE void 5605 XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, 5606 const void* XXH_RESTRICT input, 5607 const void* XXH_RESTRICT secret) 5608 { 5609 uint64_t *xacc = (uint64_t *)acc; 5610 const uint64_t *xinput = (const uint64_t *)(const void *)input; 5611 const uint64_t *xsecret = (const uint64_t *)(const void *)secret; 5612 svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); 5613 uint64_t element_count = svcntd(); 5614 if (element_count >= 8) { 5615 svbool_t mask = svptrue_pat_b64(SV_VL8); 5616 svuint64_t vacc = svld1_u64(mask, xacc); 5617 ACCRND(vacc, 0); 5618 svst1_u64(mask, xacc, vacc); 5619 } else if (element_count == 2) { /* sve128 */ 5620 svbool_t mask = svptrue_pat_b64(SV_VL2); 5621 svuint64_t acc0 = svld1_u64(mask, xacc + 0); 5622 svuint64_t acc1 = svld1_u64(mask, xacc + 2); 5623 svuint64_t acc2 = svld1_u64(mask, xacc + 4); 5624 svuint64_t acc3 = svld1_u64(mask, xacc + 6); 5625 ACCRND(acc0, 0); 5626 ACCRND(acc1, 2); 5627 ACCRND(acc2, 4); 5628 ACCRND(acc3, 6); 5629 svst1_u64(mask, xacc + 0, acc0); 5630 svst1_u64(mask, xacc + 2, acc1); 5631 svst1_u64(mask, xacc + 4, acc2); 5632 svst1_u64(mask, xacc + 6, acc3); 5633 } else { 5634 svbool_t mask = svptrue_pat_b64(SV_VL4); 5635 svuint64_t acc0 = svld1_u64(mask, xacc + 0); 5636 svuint64_t acc1 = svld1_u64(mask, xacc + 4); 5637 ACCRND(acc0, 0); 5638 ACCRND(acc1, 4); 5639 svst1_u64(mask, xacc + 0, acc0); 5640 svst1_u64(mask, xacc + 4, acc1); 5641 } 5642 } 5643 5644 XXH_FORCE_INLINE void 5645 XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, 5646 const xxh_u8* XXH_RESTRICT input, 5647 const xxh_u8* XXH_RESTRICT secret, 5648 size_t nbStripes) 5649 { 5650 if (nbStripes != 0) { 5651 uint64_t *xacc = (uint64_t *)acc; 5652 const uint64_t *xinput = (const uint64_t *)(const void *)input; 5653 const uint64_t *xsecret = (const uint64_t *)(const void *)secret; 5654 svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); 5655 uint64_t element_count = svcntd(); 5656 if (element_count >= 8) { 5657 svbool_t mask = svptrue_pat_b64(SV_VL8); 5658 svuint64_t vacc = svld1_u64(mask, xacc + 0); 5659 do { 5660 /* svprfd(svbool_t, void *, enum svfprop); */ 5661 svprfd(mask, xinput + 128, SV_PLDL1STRM); 5662 ACCRND(vacc, 0); 5663 xinput += 8; 5664 xsecret += 1; 5665 nbStripes--; 5666 } while (nbStripes != 0); 5667 5668 svst1_u64(mask, xacc + 0, vacc); 5669 } else if (element_count == 2) { /* sve128 */ 5670 svbool_t mask = svptrue_pat_b64(SV_VL2); 5671 svuint64_t acc0 = svld1_u64(mask, xacc + 0); 5672 svuint64_t acc1 = svld1_u64(mask, xacc + 2); 5673 svuint64_t acc2 = svld1_u64(mask, xacc + 4); 5674 svuint64_t acc3 = svld1_u64(mask, xacc + 6); 5675 do { 5676 svprfd(mask, xinput + 128, SV_PLDL1STRM); 5677 ACCRND(acc0, 0); 5678 ACCRND(acc1, 2); 5679 ACCRND(acc2, 4); 5680 ACCRND(acc3, 6); 5681 xinput += 8; 5682 xsecret += 1; 5683 nbStripes--; 5684 } while (nbStripes != 0); 5685 5686 svst1_u64(mask, xacc + 0, acc0); 5687 svst1_u64(mask, xacc + 2, acc1); 5688 svst1_u64(mask, xacc + 4, acc2); 5689 svst1_u64(mask, xacc + 6, acc3); 5690 } else { 5691 svbool_t mask = svptrue_pat_b64(SV_VL4); 5692 svuint64_t acc0 = svld1_u64(mask, xacc + 0); 5693 svuint64_t acc1 = svld1_u64(mask, xacc + 4); 5694 do { 5695 svprfd(mask, xinput + 128, SV_PLDL1STRM); 5696 ACCRND(acc0, 0); 5697 ACCRND(acc1, 4); 5698 xinput += 8; 5699 xsecret += 1; 5700 nbStripes--; 5701 } while (nbStripes != 0); 5702 5703 svst1_u64(mask, xacc + 0, acc0); 5704 svst1_u64(mask, xacc + 4, acc1); 5705 } 5706 } 5707 } 5708 5709 #endif 5710 5711 #if (XXH_VECTOR == XXH_LSX) 5712 #define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 5713 5714 XXH_FORCE_INLINE void 5715 XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc, 5716 const void* XXH_RESTRICT input, 5717 const void* XXH_RESTRICT secret) 5718 { 5719 XXH_ASSERT((((size_t)acc) & 15) == 0); 5720 { 5721 __m128i* const xacc = (__m128i *) acc; 5722 const __m128i* const xinput = (const __m128i *) input; 5723 const __m128i* const xsecret = (const __m128i *) secret; 5724 size_t i; 5725 5726 for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { 5727 /* data_vec = xinput[i]; */ 5728 __m128i const data_vec = __lsx_vld(xinput + i, 0); 5729 /* key_vec = xsecret[i]; */ 5730 __m128i const key_vec = __lsx_vld(xsecret + i, 0); 5731 /* data_key = data_vec ^ key_vec; */ 5732 __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); 5733 /* data_key_lo = data_key >> 32; */ 5734 __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); 5735 // __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); 5736 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 5737 __m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo); 5738 /* xacc[i] += swap(data_vec); */ 5739 __m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2)); 5740 __m128i const sum = __lsx_vadd_d(xacc[i], data_swap); 5741 /* xacc[i] += product; */ 5742 xacc[i] = __lsx_vadd_d(product, sum); 5743 } 5744 } 5745 } 5746 XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx) 5747 5748 XXH_FORCE_INLINE void 5749 XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 5750 { 5751 XXH_ASSERT((((size_t)acc) & 15) == 0); 5752 { 5753 __m128i* const xacc = (__m128i*) acc; 5754 const __m128i* const xsecret = (const __m128i *) secret; 5755 const __m128i prime32 = __lsx_vreplgr2vr_d(XXH_PRIME32_1); 5756 size_t i; 5757 5758 for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { 5759 /* xacc[i] ^= (xacc[i] >> 47) */ 5760 __m128i const acc_vec = xacc[i]; 5761 __m128i const shifted = __lsx_vsrli_d(acc_vec, 47); 5762 __m128i const data_vec = __lsx_vxor_v(acc_vec, shifted); 5763 /* xacc[i] ^= xsecret[i]; */ 5764 __m128i const key_vec = __lsx_vld(xsecret + i, 0); 5765 __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); 5766 5767 /* xacc[i] *= XXH_PRIME32_1; */ 5768 xacc[i] = __lsx_vmul_d(data_key, prime32); 5769 } 5770 } 5771 } 5772 5773 #endif 5774 5775 #if (XXH_VECTOR == XXH_LASX) 5776 #define _LASX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 5777 5778 XXH_FORCE_INLINE void 5779 XXH3_accumulate_512_lasx( void* XXH_RESTRICT acc, 5780 const void* XXH_RESTRICT input, 5781 const void* XXH_RESTRICT secret) 5782 { 5783 XXH_ASSERT((((size_t)acc) & 31) == 0); 5784 { 5785 size_t i; 5786 __m256i* const xacc = (__m256i *) acc; 5787 const __m256i* const xinput = (const __m256i *) input; 5788 const __m256i* const xsecret = (const __m256i *) secret; 5789 5790 for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) { 5791 /* data_vec = xinput[i]; */ 5792 __m256i const data_vec = __lasx_xvld(xinput + i, 0); 5793 /* key_vec = xsecret[i]; */ 5794 __m256i const key_vec = __lasx_xvld(xsecret + i, 0); 5795 /* data_key = data_vec ^ key_vec; */ 5796 __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec); 5797 /* data_key_lo = data_key >> 32; */ 5798 __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32); 5799 // __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32); 5800 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ 5801 __m256i const product = __lasx_xvmulwev_d_wu(data_key, data_key_lo); 5802 /* xacc[i] += swap(data_vec); */ 5803 __m256i const data_swap = __lasx_xvshuf4i_w(data_vec, _LASX_SHUFFLE(1, 0, 3, 2)); 5804 __m256i const sum = __lasx_xvadd_d(xacc[i], data_swap); 5805 /* xacc[i] += product; */ 5806 xacc[i] = __lasx_xvadd_d(product, sum); 5807 } 5808 } 5809 } 5810 XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lasx) 5811 5812 XXH_FORCE_INLINE void 5813 XXH3_scrambleAcc_lasx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 5814 { 5815 XXH_ASSERT((((size_t)acc) & 31) == 0); 5816 { 5817 __m256i* const xacc = (__m256i*) acc; 5818 const __m256i* const xsecret = (const __m256i *) secret; 5819 const __m256i prime32 = __lasx_xvreplgr2vr_d(XXH_PRIME32_1); 5820 size_t i; 5821 5822 for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) { 5823 /* xacc[i] ^= (xacc[i] >> 47) */ 5824 __m256i const acc_vec = xacc[i]; 5825 __m256i const shifted = __lasx_xvsrli_d(acc_vec, 47); 5826 __m256i const data_vec = __lasx_xvxor_v(acc_vec, shifted); 5827 /* xacc[i] ^= xsecret[i]; */ 5828 __m256i const key_vec = __lasx_xvld(xsecret + i, 0); 5829 __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec); 5830 5831 /* xacc[i] *= XXH_PRIME32_1; */ 5832 xacc[i] = __lasx_xvmul_d(data_key, prime32); 5833 } 5834 } 5835 } 5836 5837 #endif 5838 5839 #if (XXH_VECTOR == XXH_RVV) 5840 #define XXH_CONCAT2(X, Y) X ## Y 5841 #define XXH_CONCAT(X, Y) XXH_CONCAT2(X, Y) 5842 #if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \ 5843 (defined(__clang__) && __clang_major__ < 16)) 5844 #define XXH_RVOP(op) op 5845 #define XXH_RVCAST(op) XXH_CONCAT(vreinterpret_v_, op) 5846 #else 5847 #define XXH_RVOP(op) XXH_CONCAT(__riscv_, op) 5848 #define XXH_RVCAST(op) XXH_CONCAT(__riscv_vreinterpret_v_, op) 5849 #endif 5850 XXH_FORCE_INLINE void 5851 XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc, 5852 const void* XXH_RESTRICT input, 5853 const void* XXH_RESTRICT secret) 5854 { 5855 XXH_ASSERT((((size_t)acc) & 63) == 0); 5856 { 5857 // Try to set vector lenght to 512 bits. 5858 // If this length is unavailable, then maximum available will be used 5859 size_t vl = XXH_RVOP(vsetvl_e64m2)(8); 5860 5861 uint64_t* xacc = (uint64_t*) acc; 5862 const uint64_t* xinput = (const uint64_t*) input; 5863 const uint64_t* xsecret = (const uint64_t*) secret; 5864 static const uint64_t swap_mask[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 5865 vuint64m2_t xswap_mask = XXH_RVOP(vle64_v_u64m2)(swap_mask, vl); 5866 5867 size_t i; 5868 for (i = 0; i < XXH_STRIPE_LEN/8; i += vl) { 5869 /* data_vec = xinput[i]; */ 5870 vuint64m2_t data_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xinput + i), vl * 8)); 5871 /* key_vec = xsecret[i]; */ 5872 vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xsecret + i), vl * 8)); 5873 /* acc_vec = xacc[i]; */ 5874 vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc + i, vl); 5875 /* data_key = data_vec ^ key_vec; */ 5876 vuint64m2_t data_key = XXH_RVOP(vxor_vv_u64m2)(data_vec, key_vec, vl); 5877 /* data_key_hi = data_key >> 32; */ 5878 vuint64m2_t data_key_hi = XXH_RVOP(vsrl_vx_u64m2)(data_key, 32, vl); 5879 /* data_key_lo = data_key & 0xffffffff; */ 5880 vuint64m2_t data_key_lo = XXH_RVOP(vand_vx_u64m2)(data_key, 0xffffffff, vl); 5881 /* swap high and low halves */ 5882 vuint64m2_t data_swap = XXH_RVOP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl); 5883 /* acc_vec += data_key_lo * data_key_hi; */ 5884 acc_vec = XXH_RVOP(vmacc_vv_u64m2)(acc_vec, data_key_lo, data_key_hi, vl); 5885 /* acc_vec += data_swap; */ 5886 acc_vec = XXH_RVOP(vadd_vv_u64m2)(acc_vec, data_swap, vl); 5887 /* xacc[i] = acc_vec; */ 5888 XXH_RVOP(vse64_v_u64m2)(xacc + i, acc_vec, vl); 5889 } 5890 } 5891 } 5892 5893 XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv) 5894 5895 XXH_FORCE_INLINE void 5896 XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 5897 { 5898 XXH_ASSERT((((size_t)acc) & 15) == 0); 5899 { 5900 size_t count = XXH_STRIPE_LEN/8; 5901 uint64_t* xacc = (uint64_t*)acc; 5902 const uint8_t* xsecret = (const uint8_t *)secret; 5903 size_t vl; 5904 for (; count > 0; count -= vl, xacc += vl, xsecret += vl*8) { 5905 vl = XXH_RVOP(vsetvl_e64m2)(count); 5906 { 5907 /* key_vec = xsecret[i]; */ 5908 vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)(xsecret, vl*8)); 5909 /* acc_vec = xacc[i]; */ 5910 vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc, vl); 5911 /* acc_vec ^= acc_vec >> 47; */ 5912 vuint64m2_t vsrl = XXH_RVOP(vsrl_vx_u64m2)(acc_vec, 47, vl); 5913 acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, vsrl, vl); 5914 /* acc_vec ^= key_vec; */ 5915 acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, key_vec, vl); 5916 /* acc_vec *= XXH_PRIME32_1; */ 5917 acc_vec = XXH_RVOP(vmul_vx_u64m2)(acc_vec, XXH_PRIME32_1, vl); 5918 /* xacc[i] *= acc_vec; */ 5919 XXH_RVOP(vse64_v_u64m2)(xacc, acc_vec, vl); 5920 } 5921 } 5922 } 5923 } 5924 5925 XXH_FORCE_INLINE void 5926 XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64) 5927 { 5928 XXH_STATIC_ASSERT(XXH_SEC_ALIGN >= 8); 5929 XXH_ASSERT(((size_t)customSecret & 7) == 0); 5930 (void)(&XXH_writeLE64); 5931 { 5932 size_t count = XXH_SECRET_DEFAULT_SIZE/8; 5933 size_t vl; 5934 size_t VLMAX = XXH_RVOP(vsetvlmax_e64m2)(); 5935 int64_t* cSecret = (int64_t*)customSecret; 5936 const int64_t* kSecret = (const int64_t*)(const void*)XXH3_kSecret; 5937 5938 #if __riscv_v_intrinsic >= 1000000 5939 // ratified v1.0 intrinics version 5940 vbool32_t mneg = XXH_RVCAST(u8m1_b32)( 5941 XXH_RVOP(vmv_v_x_u8m1)(0xaa, XXH_RVOP(vsetvlmax_e8m1)())); 5942 #else 5943 // support pre-ratification intrinics, which lack mask to vector casts 5944 size_t vlmax = XXH_RVOP(vsetvlmax_e8m1)(); 5945 vbool32_t mneg = XXH_RVOP(vmseq_vx_u8mf4_b32)( 5946 XXH_RVOP(vand_vx_u8mf4)( 5947 XXH_RVOP(vid_v_u8mf4)(vlmax), 1, vlmax), 1, vlmax); 5948 #endif 5949 vint64m2_t seed = XXH_RVOP(vmv_v_x_i64m2)((int64_t)seed64, VLMAX); 5950 seed = XXH_RVOP(vneg_v_i64m2_mu)(mneg, seed, seed, VLMAX); 5951 5952 for (; count > 0; count -= vl, cSecret += vl, kSecret += vl) { 5953 /* make sure vl=VLMAX until last iteration */ 5954 vl = XXH_RVOP(vsetvl_e64m2)(count < VLMAX ? count : VLMAX); 5955 { 5956 vint64m2_t src = XXH_RVOP(vle64_v_i64m2)(kSecret, vl); 5957 vint64m2_t res = XXH_RVOP(vadd_vv_i64m2)(src, seed, vl); 5958 XXH_RVOP(vse64_v_i64m2)(cSecret, res, vl); 5959 } 5960 } 5961 } 5962 } 5963 #endif 5964 5965 5966 /* scalar variants - universal */ 5967 5968 #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) 5969 /* 5970 * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they 5971 * emit an excess mask and a full 64-bit multiply-add (MADD X-form). 5972 * 5973 * While this might not seem like much, as AArch64 is a 64-bit architecture, only 5974 * big Cortex designs have a full 64-bit multiplier. 5975 * 5976 * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit 5977 * multiplies expand to 2-3 multiplies in microcode. This has a major penalty 5978 * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. 5979 * 5980 * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does 5981 * not have this penalty and does the mask automatically. 5982 */ 5983 XXH_FORCE_INLINE xxh_u64 5984 XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) 5985 { 5986 xxh_u64 ret; 5987 /* note: %x = 64-bit register, %w = 32-bit register */ 5988 __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); 5989 return ret; 5990 } 5991 #else 5992 XXH_FORCE_INLINE xxh_u64 5993 XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) 5994 { 5995 return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; 5996 } 5997 #endif 5998 5999 /*! 6000 * @internal 6001 * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). 6002 * 6003 * This is extracted to its own function because the NEON path uses a combination 6004 * of NEON and scalar. 6005 */ 6006 XXH_FORCE_INLINE void 6007 XXH3_scalarRound(void* XXH_RESTRICT acc, 6008 void const* XXH_RESTRICT input, 6009 void const* XXH_RESTRICT secret, 6010 size_t lane) 6011 { 6012 xxh_u64* xacc = (xxh_u64*) acc; 6013 xxh_u8 const* xinput = (xxh_u8 const*) input; 6014 xxh_u8 const* xsecret = (xxh_u8 const*) secret; 6015 XXH_ASSERT(lane < XXH_ACC_NB); 6016 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); 6017 { 6018 xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); 6019 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); 6020 xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ 6021 xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); 6022 } 6023 } 6024 6025 /*! 6026 * @internal 6027 * @brief Processes a 64 byte block of data using the scalar path. 6028 */ 6029 XXH_FORCE_INLINE void 6030 XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, 6031 const void* XXH_RESTRICT input, 6032 const void* XXH_RESTRICT secret) 6033 { 6034 size_t i; 6035 /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ 6036 #if defined(__GNUC__) && !defined(__clang__) \ 6037 && (defined(__arm__) || defined(__thumb2__)) \ 6038 && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ 6039 && XXH_SIZE_OPT <= 0 6040 # pragma GCC unroll 8 6041 #endif 6042 for (i=0; i < XXH_ACC_NB; i++) { 6043 XXH3_scalarRound(acc, input, secret, i); 6044 } 6045 } 6046 XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) 6047 6048 /*! 6049 * @internal 6050 * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). 6051 * 6052 * This is extracted to its own function because the NEON path uses a combination 6053 * of NEON and scalar. 6054 */ 6055 XXH_FORCE_INLINE void 6056 XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, 6057 void const* XXH_RESTRICT secret, 6058 size_t lane) 6059 { 6060 xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ 6061 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ 6062 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); 6063 XXH_ASSERT(lane < XXH_ACC_NB); 6064 { 6065 xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); 6066 xxh_u64 acc64 = xacc[lane]; 6067 acc64 = XXH_xorshift64(acc64, 47); 6068 acc64 ^= key64; 6069 acc64 *= XXH_PRIME32_1; 6070 xacc[lane] = acc64; 6071 } 6072 } 6073 6074 /*! 6075 * @internal 6076 * @brief Scrambles the accumulators after a large chunk has been read 6077 */ 6078 XXH_FORCE_INLINE void 6079 XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) 6080 { 6081 size_t i; 6082 for (i=0; i < XXH_ACC_NB; i++) { 6083 XXH3_scalarScrambleRound(acc, secret, i); 6084 } 6085 } 6086 6087 XXH_FORCE_INLINE void 6088 XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) 6089 { 6090 /* 6091 * We need a separate pointer for the hack below, 6092 * which requires a non-const pointer. 6093 * Any decent compiler will optimize this out otherwise. 6094 */ 6095 const xxh_u8* kSecretPtr = XXH3_kSecret; 6096 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); 6097 6098 #if defined(__GNUC__) && defined(__aarch64__) 6099 /* 6100 * UGLY HACK: 6101 * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are 6102 * placed sequentially, in order, at the top of the unrolled loop. 6103 * 6104 * While MOVK is great for generating constants (2 cycles for a 64-bit 6105 * constant compared to 4 cycles for LDR), it fights for bandwidth with 6106 * the arithmetic instructions. 6107 * 6108 * I L S 6109 * MOVK 6110 * MOVK 6111 * MOVK 6112 * MOVK 6113 * ADD 6114 * SUB STR 6115 * STR 6116 * By forcing loads from memory (as the asm line causes the compiler to assume 6117 * that XXH3_kSecretPtr has been changed), the pipelines are used more 6118 * efficiently: 6119 * I L S 6120 * LDR 6121 * ADD LDR 6122 * SUB STR 6123 * STR 6124 * 6125 * See XXH3_NEON_LANES for details on the pipeline. 6126 * 6127 * XXH3_64bits_withSeed, len == 256, Snapdragon 835 6128 * without hack: 2654.4 MB/s 6129 * with hack: 3202.9 MB/s 6130 */ 6131 XXH_COMPILER_GUARD(kSecretPtr); 6132 #endif 6133 { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; 6134 int i; 6135 for (i=0; i < nbRounds; i++) { 6136 /* 6137 * The asm hack causes the compiler to assume that kSecretPtr aliases with 6138 * customSecret, and on aarch64, this prevented LDP from merging two 6139 * loads together for free. Putting the loads together before the stores 6140 * properly generates LDP. 6141 */ 6142 xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; 6143 xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; 6144 XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); 6145 XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); 6146 } } 6147 } 6148 6149 6150 typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); 6151 typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); 6152 typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); 6153 6154 6155 #if (XXH_VECTOR == XXH_AVX512) 6156 6157 #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 6158 #define XXH3_accumulate XXH3_accumulate_avx512 6159 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 6160 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 6161 6162 #elif (XXH_VECTOR == XXH_AVX2) 6163 6164 #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 6165 #define XXH3_accumulate XXH3_accumulate_avx2 6166 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 6167 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 6168 6169 #elif (XXH_VECTOR == XXH_SSE2) 6170 6171 #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 6172 #define XXH3_accumulate XXH3_accumulate_sse2 6173 #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 6174 #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 6175 6176 #elif (XXH_VECTOR == XXH_NEON) 6177 6178 #define XXH3_accumulate_512 XXH3_accumulate_512_neon 6179 #define XXH3_accumulate XXH3_accumulate_neon 6180 #define XXH3_scrambleAcc XXH3_scrambleAcc_neon 6181 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 6182 6183 #elif (XXH_VECTOR == XXH_VSX) 6184 6185 #define XXH3_accumulate_512 XXH3_accumulate_512_vsx 6186 #define XXH3_accumulate XXH3_accumulate_vsx 6187 #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx 6188 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 6189 6190 #elif (XXH_VECTOR == XXH_SVE) 6191 #define XXH3_accumulate_512 XXH3_accumulate_512_sve 6192 #define XXH3_accumulate XXH3_accumulate_sve 6193 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar 6194 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 6195 6196 #elif (XXH_VECTOR == XXH_LASX) 6197 #define XXH3_accumulate_512 XXH3_accumulate_512_lasx 6198 #define XXH3_accumulate XXH3_accumulate_lasx 6199 #define XXH3_scrambleAcc XXH3_scrambleAcc_lasx 6200 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 6201 6202 #elif (XXH_VECTOR == XXH_LSX) 6203 #define XXH3_accumulate_512 XXH3_accumulate_512_lsx 6204 #define XXH3_accumulate XXH3_accumulate_lsx 6205 #define XXH3_scrambleAcc XXH3_scrambleAcc_lsx 6206 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 6207 6208 #elif (XXH_VECTOR == XXH_RVV) 6209 #define XXH3_accumulate_512 XXH3_accumulate_512_rvv 6210 #define XXH3_accumulate XXH3_accumulate_rvv 6211 #define XXH3_scrambleAcc XXH3_scrambleAcc_rvv 6212 #define XXH3_initCustomSecret XXH3_initCustomSecret_rvv 6213 6214 #else /* scalar */ 6215 6216 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar 6217 #define XXH3_accumulate XXH3_accumulate_scalar 6218 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar 6219 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 6220 6221 #endif 6222 6223 #if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ 6224 # undef XXH3_initCustomSecret 6225 # define XXH3_initCustomSecret XXH3_initCustomSecret_scalar 6226 #endif 6227 6228 XXH_FORCE_INLINE void 6229 XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, 6230 const xxh_u8* XXH_RESTRICT input, size_t len, 6231 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 6232 XXH3_f_accumulate f_acc, 6233 XXH3_f_scrambleAcc f_scramble) 6234 { 6235 size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; 6236 size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; 6237 size_t const nb_blocks = (len - 1) / block_len; 6238 6239 size_t n; 6240 6241 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 6242 6243 for (n = 0; n < nb_blocks; n++) { 6244 f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); 6245 f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); 6246 } 6247 6248 /* last partial block */ 6249 XXH_ASSERT(len > XXH_STRIPE_LEN); 6250 { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; 6251 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); 6252 f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); 6253 6254 /* last stripe */ 6255 { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; 6256 #define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ 6257 XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); 6258 } } 6259 } 6260 6261 XXH_FORCE_INLINE xxh_u64 6262 XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) 6263 { 6264 return XXH3_mul128_fold64( 6265 acc[0] ^ XXH_readLE64(secret), 6266 acc[1] ^ XXH_readLE64(secret+8) ); 6267 } 6268 6269 static XXH_PUREF XXH64_hash_t 6270 XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) 6271 { 6272 xxh_u64 result64 = start; 6273 size_t i = 0; 6274 6275 for (i = 0; i < 4; i++) { 6276 result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); 6277 #if defined(__clang__) /* Clang */ \ 6278 && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ 6279 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ 6280 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ 6281 /* 6282 * UGLY HACK: 6283 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as 6284 * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 6285 * XXH3_64bits, len == 256, Snapdragon 835: 6286 * without hack: 2063.7 MB/s 6287 * with hack: 2560.7 MB/s 6288 */ 6289 XXH_COMPILER_GUARD(result64); 6290 #endif 6291 } 6292 6293 return XXH3_avalanche(result64); 6294 } 6295 6296 /* do not align on 8, so that the secret is different from the accumulator */ 6297 #define XXH_SECRET_MERGEACCS_START 11 6298 6299 static XXH_PUREF XXH64_hash_t 6300 XXH3_finalizeLong_64b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 len) 6301 { 6302 return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, len * XXH_PRIME64_1); 6303 } 6304 6305 #define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ 6306 XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } 6307 6308 XXH_FORCE_INLINE XXH64_hash_t 6309 XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, 6310 const void* XXH_RESTRICT secret, size_t secretSize, 6311 XXH3_f_accumulate f_acc, 6312 XXH3_f_scrambleAcc f_scramble) 6313 { 6314 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; 6315 6316 XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); 6317 6318 /* converge into final hash */ 6319 XXH_STATIC_ASSERT(sizeof(acc) == 64); 6320 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); 6321 return XXH3_finalizeLong_64b(acc, (const xxh_u8*)secret, (xxh_u64)len); 6322 } 6323 6324 /* 6325 * It's important for performance to transmit secret's size (when it's static) 6326 * so that the compiler can properly optimize the vectorized loop. 6327 * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. 6328 * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE 6329 * breaks -Og, this is XXH_NO_INLINE. 6330 */ 6331 XXH3_WITH_SECRET_INLINE XXH64_hash_t 6332 XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, 6333 XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) 6334 { 6335 (void)seed64; 6336 return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); 6337 } 6338 6339 /* 6340 * It's preferable for performance that XXH3_hashLong is not inlined, 6341 * as it results in a smaller function for small data, easier to the instruction cache. 6342 * Note that inside this no_inline function, we do inline the internal loop, 6343 * and provide a statically defined secret size to allow optimization of vector loop. 6344 */ 6345 XXH_NO_INLINE XXH_PUREF XXH64_hash_t 6346 XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, 6347 XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) 6348 { 6349 (void)seed64; (void)secret; (void)secretLen; 6350 return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); 6351 } 6352 6353 /* 6354 * XXH3_hashLong_64b_withSeed(): 6355 * Generate a custom key based on alteration of default XXH3_kSecret with the seed, 6356 * and then use this key for long mode hashing. 6357 * 6358 * This operation is decently fast but nonetheless costs a little bit of time. 6359 * Try to avoid it whenever possible (typically when seed==0). 6360 * 6361 * It's important for performance that XXH3_hashLong is not inlined. Not sure 6362 * why (uop cache maybe?), but the difference is large and easily measurable. 6363 */ 6364 XXH_FORCE_INLINE XXH64_hash_t 6365 XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, 6366 XXH64_hash_t seed, 6367 XXH3_f_accumulate f_acc, 6368 XXH3_f_scrambleAcc f_scramble, 6369 XXH3_f_initCustomSecret f_initSec) 6370 { 6371 #if XXH_SIZE_OPT <= 0 6372 if (seed == 0) 6373 return XXH3_hashLong_64b_internal(input, len, 6374 XXH3_kSecret, sizeof(XXH3_kSecret), 6375 f_acc, f_scramble); 6376 #endif 6377 { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; 6378 f_initSec(secret, seed); 6379 return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), 6380 f_acc, f_scramble); 6381 } 6382 } 6383 6384 /* 6385 * It's important for performance that XXH3_hashLong is not inlined. 6386 */ 6387 XXH_NO_INLINE XXH64_hash_t 6388 XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, 6389 XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) 6390 { 6391 (void)secret; (void)secretLen; 6392 return XXH3_hashLong_64b_withSeed_internal(input, len, seed, 6393 XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); 6394 } 6395 6396 6397 typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, 6398 XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); 6399 6400 XXH_FORCE_INLINE XXH64_hash_t 6401 XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, 6402 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, 6403 XXH3_hashLong64_f f_hashLong) 6404 { 6405 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); 6406 /* 6407 * If an action is to be taken if `secretLen` condition is not respected, 6408 * it should be done here. 6409 * For now, it's a contract pre-condition. 6410 * Adding a check and a branch here would cost performance at every hash. 6411 * Also, note that function signature doesn't offer room to return an error. 6412 */ 6413 if (len <= 16) 6414 return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); 6415 if (len <= 128) 6416 return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); 6417 if (len <= XXH3_MIDSIZE_MAX) 6418 return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); 6419 return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); 6420 } 6421 6422 6423 /* === Public entry point === */ 6424 6425 /*! @ingroup XXH3_family */ 6426 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) 6427 { 6428 return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); 6429 } 6430 6431 /*! @ingroup XXH3_family */ 6432 XXH_PUBLIC_API XXH64_hash_t 6433 XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) 6434 { 6435 return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); 6436 } 6437 6438 /*! @ingroup XXH3_family */ 6439 XXH_PUBLIC_API XXH64_hash_t 6440 XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) 6441 { 6442 return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); 6443 } 6444 6445 XXH_PUBLIC_API XXH64_hash_t 6446 XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) 6447 { 6448 if (length <= XXH3_MIDSIZE_MAX) 6449 return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); 6450 return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); 6451 } 6452 6453 6454 /* === XXH3 streaming === */ 6455 #ifndef XXH_NO_STREAM 6456 /* 6457 * Malloc's a pointer that is always aligned to @align. 6458 * 6459 * This must be freed with `XXH_alignedFree()`. 6460 * 6461 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte 6462 * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 6463 * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. 6464 * 6465 * This underalignment previously caused a rather obvious crash which went 6466 * completely unnoticed due to XXH3_createState() not actually being tested. 6467 * Credit to RedSpah for noticing this bug. 6468 * 6469 * The alignment is done manually: Functions like posix_memalign or _mm_malloc 6470 * are avoided: To maintain portability, we would have to write a fallback 6471 * like this anyways, and besides, testing for the existence of library 6472 * functions without relying on external build tools is impossible. 6473 * 6474 * The method is simple: Overallocate, manually align, and store the offset 6475 * to the original behind the returned pointer. 6476 * 6477 * Align must be a power of 2 and 8 <= align <= 128. 6478 */ 6479 static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) 6480 { 6481 XXH_ASSERT(align <= 128 && align >= 8); /* range check */ 6482 XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ 6483 XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ 6484 { /* Overallocate to make room for manual realignment and an offset byte */ 6485 xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); 6486 if (base != NULL) { 6487 /* 6488 * Get the offset needed to align this pointer. 6489 * 6490 * Even if the returned pointer is aligned, there will always be 6491 * at least one byte to store the offset to the original pointer. 6492 */ 6493 size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ 6494 /* Add the offset for the now-aligned pointer */ 6495 xxh_u8* ptr = base + offset; 6496 6497 XXH_ASSERT((size_t)ptr % align == 0); 6498 6499 /* Store the offset immediately before the returned pointer. */ 6500 ptr[-1] = (xxh_u8)offset; 6501 return ptr; 6502 } 6503 return NULL; 6504 } 6505 } 6506 /* 6507 * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass 6508 * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. 6509 */ 6510 static void XXH_alignedFree(void* p) 6511 { 6512 if (p != NULL) { 6513 xxh_u8* ptr = (xxh_u8*)p; 6514 /* Get the offset byte we added in XXH_malloc. */ 6515 xxh_u8 offset = ptr[-1]; 6516 /* Free the original malloc'd pointer */ 6517 xxh_u8* base = ptr - offset; 6518 XXH_free(base); 6519 } 6520 } 6521 /*! @ingroup XXH3_family */ 6522 /*! 6523 * @brief Allocate an @ref XXH3_state_t. 6524 * 6525 * @return An allocated pointer of @ref XXH3_state_t on success. 6526 * @return `NULL` on failure. 6527 * 6528 * @note Must be freed with XXH3_freeState(). 6529 * 6530 * @see @ref streaming_example "Streaming Example" 6531 */ 6532 XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) 6533 { 6534 XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); 6535 if (state==NULL) return NULL; 6536 XXH3_INITSTATE(state); 6537 return state; 6538 } 6539 6540 /*! @ingroup XXH3_family */ 6541 /*! 6542 * @brief Frees an @ref XXH3_state_t. 6543 * 6544 * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). 6545 * 6546 * @return @ref XXH_OK. 6547 * 6548 * @note Must be allocated with XXH3_createState(). 6549 * 6550 * @see @ref streaming_example "Streaming Example" 6551 */ 6552 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) 6553 { 6554 XXH_alignedFree(statePtr); 6555 return XXH_OK; 6556 } 6557 6558 /*! @ingroup XXH3_family */ 6559 XXH_PUBLIC_API void 6560 XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) 6561 { 6562 XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); 6563 } 6564 6565 static void 6566 XXH3_reset_internal(XXH3_state_t* statePtr, 6567 XXH64_hash_t seed, 6568 const void* secret, size_t secretSize) 6569 { 6570 size_t const initStart = offsetof(XXH3_state_t, bufferedSize); 6571 size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; 6572 XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); 6573 XXH_ASSERT(statePtr != NULL); 6574 /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ 6575 XXH_memset((char*)statePtr + initStart, 0, initLength); 6576 statePtr->acc[0] = XXH_PRIME32_3; 6577 statePtr->acc[1] = XXH_PRIME64_1; 6578 statePtr->acc[2] = XXH_PRIME64_2; 6579 statePtr->acc[3] = XXH_PRIME64_3; 6580 statePtr->acc[4] = XXH_PRIME64_4; 6581 statePtr->acc[5] = XXH_PRIME32_2; 6582 statePtr->acc[6] = XXH_PRIME64_5; 6583 statePtr->acc[7] = XXH_PRIME32_1; 6584 statePtr->seed = seed; 6585 statePtr->useSeed = (seed != 0); 6586 statePtr->extSecret = (const unsigned char*)secret; 6587 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 6588 statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; 6589 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; 6590 } 6591 6592 /*! @ingroup XXH3_family */ 6593 XXH_PUBLIC_API XXH_errorcode 6594 XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) 6595 { 6596 if (statePtr == NULL) return XXH_ERROR; 6597 XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); 6598 return XXH_OK; 6599 } 6600 6601 /*! @ingroup XXH3_family */ 6602 XXH_PUBLIC_API XXH_errorcode 6603 XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) 6604 { 6605 if (statePtr == NULL) return XXH_ERROR; 6606 XXH3_reset_internal(statePtr, 0, secret, secretSize); 6607 if (secret == NULL) return XXH_ERROR; 6608 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; 6609 return XXH_OK; 6610 } 6611 6612 /*! @ingroup XXH3_family */ 6613 XXH_PUBLIC_API XXH_errorcode 6614 XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) 6615 { 6616 if (statePtr == NULL) return XXH_ERROR; 6617 if (seed==0) return XXH3_64bits_reset(statePtr); 6618 if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) 6619 XXH3_initCustomSecret(statePtr->customSecret, seed); 6620 XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); 6621 return XXH_OK; 6622 } 6623 6624 /*! @ingroup XXH3_family */ 6625 XXH_PUBLIC_API XXH_errorcode 6626 XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) 6627 { 6628 if (statePtr == NULL) return XXH_ERROR; 6629 if (secret == NULL) return XXH_ERROR; 6630 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; 6631 XXH3_reset_internal(statePtr, seed64, secret, secretSize); 6632 statePtr->useSeed = 1; /* always, even if seed64==0 */ 6633 return XXH_OK; 6634 } 6635 6636 /*! 6637 * @internal 6638 * @brief Processes a large input for XXH3_update() and XXH3_digest_long(). 6639 * 6640 * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block. 6641 * 6642 * @param acc Pointer to the 8 accumulator lanes 6643 * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block* 6644 * @param nbStripesPerBlock Number of stripes in a block 6645 * @param input Input pointer 6646 * @param nbStripes Number of stripes to process 6647 * @param secret Secret pointer 6648 * @param secretLimit Offset of the last block in @p secret 6649 * @param f_acc Pointer to an XXH3_accumulate implementation 6650 * @param f_scramble Pointer to an XXH3_scrambleAcc implementation 6651 * @return Pointer past the end of @p input after processing 6652 */ 6653 XXH_FORCE_INLINE const xxh_u8 * 6654 XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, 6655 size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, 6656 const xxh_u8* XXH_RESTRICT input, size_t nbStripes, 6657 const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, 6658 XXH3_f_accumulate f_acc, 6659 XXH3_f_scrambleAcc f_scramble) 6660 { 6661 const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE; 6662 /* Process full blocks */ 6663 if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) { 6664 /* Process the initial partial block... */ 6665 size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr; 6666 6667 do { 6668 /* Accumulate and scramble */ 6669 f_acc(acc, input, initialSecret, nbStripesThisIter); 6670 f_scramble(acc, secret + secretLimit); 6671 input += nbStripesThisIter * XXH_STRIPE_LEN; 6672 nbStripes -= nbStripesThisIter; 6673 /* Then continue the loop with the full block size */ 6674 nbStripesThisIter = nbStripesPerBlock; 6675 initialSecret = secret; 6676 } while (nbStripes >= nbStripesPerBlock); 6677 *nbStripesSoFarPtr = 0; 6678 } 6679 /* Process a partial block */ 6680 if (nbStripes > 0) { 6681 f_acc(acc, input, initialSecret, nbStripes); 6682 input += nbStripes * XXH_STRIPE_LEN; 6683 *nbStripesSoFarPtr += nbStripes; 6684 } 6685 /* Return end pointer */ 6686 return input; 6687 } 6688 6689 #ifndef XXH3_STREAM_USE_STACK 6690 # if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ 6691 # define XXH3_STREAM_USE_STACK 1 6692 # endif 6693 #endif 6694 /* This function accepts f_acc and f_scramble as function pointers, 6695 * making it possible to implement multiple variants with different acc & scramble stages. 6696 * This is notably useful to implement multiple vector variants with different intrinsics. 6697 */ 6698 XXH_FORCE_INLINE XXH_errorcode 6699 XXH3_update(XXH3_state_t* XXH_RESTRICT const state, 6700 const xxh_u8* XXH_RESTRICT input, size_t len, 6701 XXH3_f_accumulate f_acc, 6702 XXH3_f_scrambleAcc f_scramble) 6703 { 6704 if (input==NULL) { 6705 XXH_ASSERT(len == 0); 6706 return XXH_OK; 6707 } 6708 6709 XXH_ASSERT(state != NULL); 6710 state->totalLen += len; 6711 6712 /* small input : just fill in tmp buffer */ 6713 XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); 6714 if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { 6715 XXH_memcpy(state->buffer + state->bufferedSize, input, len); 6716 state->bufferedSize += (XXH32_hash_t)len; 6717 return XXH_OK; 6718 } 6719 6720 { const xxh_u8* const bEnd = input + len; 6721 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; 6722 #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 6723 /* For some reason, gcc and MSVC seem to suffer greatly 6724 * when operating accumulators directly into state. 6725 * Operating into stack space seems to enable proper optimization. 6726 * clang, on the other hand, doesn't seem to need this trick */ 6727 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; 6728 XXH_memcpy(acc, state->acc, sizeof(acc)); 6729 #else 6730 xxh_u64* XXH_RESTRICT const acc = state->acc; 6731 #endif 6732 6733 /* total input is now > XXH3_INTERNALBUFFER_SIZE */ 6734 #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) 6735 XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ 6736 6737 /* 6738 * Internal buffer is partially filled (always, except at beginning) 6739 * Complete it, then consume it. 6740 */ 6741 if (state->bufferedSize) { 6742 size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; 6743 XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); 6744 input += loadSize; 6745 XXH3_consumeStripes(acc, 6746 &state->nbStripesSoFar, state->nbStripesPerBlock, 6747 state->buffer, XXH3_INTERNALBUFFER_STRIPES, 6748 secret, state->secretLimit, 6749 f_acc, f_scramble); 6750 state->bufferedSize = 0; 6751 } 6752 XXH_ASSERT(input < bEnd); 6753 if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { 6754 size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; 6755 input = XXH3_consumeStripes(acc, 6756 &state->nbStripesSoFar, state->nbStripesPerBlock, 6757 input, nbStripes, 6758 secret, state->secretLimit, 6759 f_acc, f_scramble); 6760 XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); 6761 6762 } 6763 /* Some remaining input (always) : buffer it */ 6764 XXH_ASSERT(input < bEnd); 6765 XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); 6766 XXH_ASSERT(state->bufferedSize == 0); 6767 XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); 6768 state->bufferedSize = (XXH32_hash_t)(bEnd-input); 6769 #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 6770 /* save stack accumulators into state */ 6771 XXH_memcpy(state->acc, acc, sizeof(acc)); 6772 #endif 6773 } 6774 6775 return XXH_OK; 6776 } 6777 6778 /* 6779 * Both XXH3_64bits_update and XXH3_128bits_update use this routine. 6780 */ 6781 XXH_NO_INLINE XXH_errorcode 6782 XXH3_update_regular(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) 6783 { 6784 return XXH3_update(state, (const xxh_u8*)input, len, 6785 XXH3_accumulate, XXH3_scrambleAcc); 6786 } 6787 6788 /*! @ingroup XXH3_family */ 6789 XXH_PUBLIC_API XXH_errorcode 6790 XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) 6791 { 6792 return XXH3_update_regular(state, input, len); 6793 } 6794 6795 6796 XXH_FORCE_INLINE void 6797 XXH3_digest_long (XXH64_hash_t* acc, 6798 const XXH3_state_t* state, 6799 const unsigned char* secret) 6800 { 6801 xxh_u8 lastStripe[XXH_STRIPE_LEN]; 6802 const xxh_u8* lastStripePtr; 6803 6804 /* 6805 * Digest on a local copy. This way, the state remains unaltered, and it can 6806 * continue ingesting more input afterwards. 6807 */ 6808 XXH_memcpy(acc, state->acc, sizeof(state->acc)); 6809 if (state->bufferedSize >= XXH_STRIPE_LEN) { 6810 /* Consume remaining stripes then point to remaining data in buffer */ 6811 size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; 6812 size_t nbStripesSoFar = state->nbStripesSoFar; 6813 XXH3_consumeStripes(acc, 6814 &nbStripesSoFar, state->nbStripesPerBlock, 6815 state->buffer, nbStripes, 6816 secret, state->secretLimit, 6817 XXH3_accumulate, XXH3_scrambleAcc); 6818 lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; 6819 } else { /* bufferedSize < XXH_STRIPE_LEN */ 6820 /* Copy to temp buffer */ 6821 size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; 6822 XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ 6823 XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); 6824 XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); 6825 lastStripePtr = lastStripe; 6826 } 6827 /* Last stripe */ 6828 XXH3_accumulate_512(acc, 6829 lastStripePtr, 6830 secret + state->secretLimit - XXH_SECRET_LASTACC_START); 6831 } 6832 6833 /*! @ingroup XXH3_family */ 6834 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) 6835 { 6836 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; 6837 if (state->totalLen > XXH3_MIDSIZE_MAX) { 6838 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; 6839 XXH3_digest_long(acc, state, secret); 6840 return XXH3_finalizeLong_64b(acc, secret, (xxh_u64)state->totalLen); 6841 } 6842 /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ 6843 if (state->useSeed) 6844 return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); 6845 return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), 6846 secret, state->secretLimit + XXH_STRIPE_LEN); 6847 } 6848 #endif /* !XXH_NO_STREAM */ 6849 6850 6851 /* ========================================== 6852 * XXH3 128 bits (a.k.a XXH128) 6853 * ========================================== 6854 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, 6855 * even without counting the significantly larger output size. 6856 * 6857 * For example, extra steps are taken to avoid the seed-dependent collisions 6858 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). 6859 * 6860 * This strength naturally comes at the cost of some speed, especially on short 6861 * lengths. Note that longer hashes are about as fast as the 64-bit version 6862 * due to it using only a slight modification of the 64-bit loop. 6863 * 6864 * XXH128 is also more oriented towards 64-bit machines. It is still extremely 6865 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). 6866 */ 6867 6868 XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t 6869 XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 6870 { 6871 /* A doubled version of 1to3_64b with different constants. */ 6872 XXH_ASSERT(input != NULL); 6873 XXH_ASSERT(1 <= len && len <= 3); 6874 XXH_ASSERT(secret != NULL); 6875 /* 6876 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } 6877 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } 6878 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } 6879 */ 6880 { xxh_u8 const c1 = input[0]; 6881 xxh_u8 const c2 = input[len >> 1]; 6882 xxh_u8 const c3 = input[len - 1]; 6883 xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) 6884 | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); 6885 xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); 6886 xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; 6887 xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; 6888 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; 6889 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; 6890 XXH128_hash_t h128; 6891 h128.low64 = XXH64_avalanche(keyed_lo); 6892 h128.high64 = XXH64_avalanche(keyed_hi); 6893 return h128; 6894 } 6895 } 6896 6897 XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t 6898 XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 6899 { 6900 XXH_ASSERT(input != NULL); 6901 XXH_ASSERT(secret != NULL); 6902 XXH_ASSERT(4 <= len && len <= 8); 6903 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; 6904 { xxh_u32 const input_lo = XXH_readLE32(input); 6905 xxh_u32 const input_hi = XXH_readLE32(input + len - 4); 6906 xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); 6907 xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; 6908 xxh_u64 const keyed = input_64 ^ bitflip; 6909 6910 /* Shift len to the left to ensure it is even, this avoids even multiplies. */ 6911 XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); 6912 6913 m128.high64 += (m128.low64 << 1); 6914 m128.low64 ^= (m128.high64 >> 3); 6915 6916 m128.low64 = XXH_xorshift64(m128.low64, 35); 6917 m128.low64 *= PRIME_MX2; 6918 m128.low64 = XXH_xorshift64(m128.low64, 28); 6919 m128.high64 = XXH3_avalanche(m128.high64); 6920 return m128; 6921 } 6922 } 6923 6924 XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t 6925 XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 6926 { 6927 XXH_ASSERT(input != NULL); 6928 XXH_ASSERT(secret != NULL); 6929 XXH_ASSERT(9 <= len && len <= 16); 6930 { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; 6931 xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; 6932 xxh_u64 const input_lo = XXH_readLE64(input); 6933 xxh_u64 input_hi = XXH_readLE64(input + len - 8); 6934 XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); 6935 /* 6936 * Put len in the middle of m128 to ensure that the length gets mixed to 6937 * both the low and high bits in the 128x64 multiply below. 6938 */ 6939 m128.low64 += (xxh_u64)(len - 1) << 54; 6940 input_hi ^= bitfliph; 6941 /* 6942 * Add the high 32 bits of input_hi to the high 32 bits of m128, then 6943 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to 6944 * the high 64 bits of m128. 6945 * 6946 * The best approach to this operation is different on 32-bit and 64-bit. 6947 */ 6948 if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ 6949 /* 6950 * 32-bit optimized version, which is more readable. 6951 * 6952 * On 32-bit, it removes an ADC and delays a dependency between the two 6953 * halves of m128.high64, but it generates an extra mask on 64-bit. 6954 */ 6955 m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); 6956 } else { 6957 /* 6958 * 64-bit optimized (albeit more confusing) version. 6959 * 6960 * Uses some properties of addition and multiplication to remove the mask: 6961 * 6962 * Let: 6963 * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) 6964 * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) 6965 * c = XXH_PRIME32_2 6966 * 6967 * a + (b * c) 6968 * Inverse Property: x + y - x == y 6969 * a + (b * (1 + c - 1)) 6970 * Distributive Property: x * (y + z) == (x * y) + (x * z) 6971 * a + (b * 1) + (b * (c - 1)) 6972 * Identity Property: x * 1 == x 6973 * a + b + (b * (c - 1)) 6974 * 6975 * Substitute a, b, and c: 6976 * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) 6977 * 6978 * Since input_hi.hi + input_hi.lo == input_hi, we get this: 6979 * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) 6980 */ 6981 m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); 6982 } 6983 /* m128 ^= XXH_swap64(m128 >> 64); */ 6984 m128.low64 ^= XXH_swap64(m128.high64); 6985 6986 { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ 6987 XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); 6988 h128.high64 += m128.high64 * XXH_PRIME64_2; 6989 6990 h128.low64 = XXH3_avalanche(h128.low64); 6991 h128.high64 = XXH3_avalanche(h128.high64); 6992 return h128; 6993 } } 6994 } 6995 6996 /* 6997 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN 6998 */ 6999 XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t 7000 XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) 7001 { 7002 XXH_ASSERT(len <= 16); 7003 { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); 7004 if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); 7005 if (len) return XXH3_len_1to3_128b(input, len, secret, seed); 7006 { XXH128_hash_t h128; 7007 xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); 7008 xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); 7009 h128.low64 = XXH64_avalanche(seed ^ bitflipl); 7010 h128.high64 = XXH64_avalanche( seed ^ bitfliph); 7011 return h128; 7012 } } 7013 } 7014 7015 /* 7016 * A bit slower than XXH3_mix16B, but handles multiply by zero better. 7017 */ 7018 XXH_FORCE_INLINE XXH128_hash_t 7019 XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, 7020 const xxh_u8* secret, XXH64_hash_t seed) 7021 { 7022 acc.low64 += XXH3_mix16B (input_1, secret+0, seed); 7023 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); 7024 acc.high64 += XXH3_mix16B (input_2, secret+16, seed); 7025 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); 7026 return acc; 7027 } 7028 7029 7030 XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t 7031 XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, 7032 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 7033 XXH64_hash_t seed) 7034 { 7035 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 7036 XXH_ASSERT(16 < len && len <= 128); 7037 7038 { XXH128_hash_t acc; 7039 acc.low64 = len * XXH_PRIME64_1; 7040 acc.high64 = 0; 7041 7042 #if XXH_SIZE_OPT >= 1 7043 { 7044 /* Smaller, but slightly slower. */ 7045 unsigned int i = (unsigned int)(len - 1) / 32; 7046 do { 7047 acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); 7048 } while (i-- != 0); 7049 } 7050 #else 7051 if (len > 32) { 7052 if (len > 64) { 7053 if (len > 96) { 7054 acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); 7055 } 7056 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); 7057 } 7058 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); 7059 } 7060 acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); 7061 #endif 7062 { XXH128_hash_t h128; 7063 h128.low64 = acc.low64 + acc.high64; 7064 h128.high64 = (acc.low64 * XXH_PRIME64_1) 7065 + (acc.high64 * XXH_PRIME64_4) 7066 + ((len - seed) * XXH_PRIME64_2); 7067 h128.low64 = XXH3_avalanche(h128.low64); 7068 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); 7069 return h128; 7070 } 7071 } 7072 } 7073 7074 XXH_NO_INLINE XXH_PUREF XXH128_hash_t 7075 XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, 7076 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 7077 XXH64_hash_t seed) 7078 { 7079 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; 7080 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 7081 7082 { XXH128_hash_t acc; 7083 unsigned i; 7084 acc.low64 = len * XXH_PRIME64_1; 7085 acc.high64 = 0; 7086 /* 7087 * We set as `i` as offset + 32. We do this so that unchanged 7088 * `len` can be used as upper bound. This reaches a sweet spot 7089 * where both x86 and aarch64 get simple agen and good codegen 7090 * for the loop. 7091 */ 7092 for (i = 32; i < 160; i += 32) { 7093 acc = XXH128_mix32B(acc, 7094 input + i - 32, 7095 input + i - 16, 7096 secret + i - 32, 7097 seed); 7098 } 7099 acc.low64 = XXH3_avalanche(acc.low64); 7100 acc.high64 = XXH3_avalanche(acc.high64); 7101 /* 7102 * NB: `i <= len` will duplicate the last 32-bytes if 7103 * len % 32 was zero. This is an unfortunate necessity to keep 7104 * the hash result stable. 7105 */ 7106 for (i=160; i <= len; i += 32) { 7107 acc = XXH128_mix32B(acc, 7108 input + i - 32, 7109 input + i - 16, 7110 secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, 7111 seed); 7112 } 7113 /* last bytes */ 7114 acc = XXH128_mix32B(acc, 7115 input + len - 16, 7116 input + len - 32, 7117 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 7118 (XXH64_hash_t)0 - seed); 7119 7120 { XXH128_hash_t h128; 7121 h128.low64 = acc.low64 + acc.high64; 7122 h128.high64 = (acc.low64 * XXH_PRIME64_1) 7123 + (acc.high64 * XXH_PRIME64_4) 7124 + ((len - seed) * XXH_PRIME64_2); 7125 h128.low64 = XXH3_avalanche(h128.low64); 7126 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); 7127 return h128; 7128 } 7129 } 7130 } 7131 7132 static XXH_PUREF XXH128_hash_t 7133 XXH3_finalizeLong_128b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, xxh_u64 len) 7134 { 7135 XXH128_hash_t h128; 7136 h128.low64 = XXH3_finalizeLong_64b(acc, secret, len); 7137 h128.high64 = XXH3_mergeAccs(acc, secret + secretSize 7138 - XXH_STRIPE_LEN - XXH_SECRET_MERGEACCS_START, 7139 ~(len * XXH_PRIME64_2)); 7140 return h128; 7141 } 7142 7143 XXH_FORCE_INLINE XXH128_hash_t 7144 XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, 7145 const xxh_u8* XXH_RESTRICT secret, size_t secretSize, 7146 XXH3_f_accumulate f_acc, 7147 XXH3_f_scrambleAcc f_scramble) 7148 { 7149 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; 7150 7151 XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); 7152 7153 /* converge into final hash */ 7154 XXH_STATIC_ASSERT(sizeof(acc) == 64); 7155 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); 7156 return XXH3_finalizeLong_128b(acc, secret, secretSize, (xxh_u64)len); 7157 } 7158 7159 /* 7160 * It's important for performance that XXH3_hashLong() is not inlined. 7161 */ 7162 XXH_NO_INLINE XXH_PUREF XXH128_hash_t 7163 XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, 7164 XXH64_hash_t seed64, 7165 const void* XXH_RESTRICT secret, size_t secretLen) 7166 { 7167 (void)seed64; (void)secret; (void)secretLen; 7168 return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), 7169 XXH3_accumulate, XXH3_scrambleAcc); 7170 } 7171 7172 /* 7173 * It's important for performance to pass @p secretLen (when it's static) 7174 * to the compiler, so that it can properly optimize the vectorized loop. 7175 * 7176 * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE 7177 * breaks -Og, this is XXH_NO_INLINE. 7178 */ 7179 XXH3_WITH_SECRET_INLINE XXH128_hash_t 7180 XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, 7181 XXH64_hash_t seed64, 7182 const void* XXH_RESTRICT secret, size_t secretLen) 7183 { 7184 (void)seed64; 7185 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, 7186 XXH3_accumulate, XXH3_scrambleAcc); 7187 } 7188 7189 XXH_FORCE_INLINE XXH128_hash_t 7190 XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, 7191 XXH64_hash_t seed64, 7192 XXH3_f_accumulate f_acc, 7193 XXH3_f_scrambleAcc f_scramble, 7194 XXH3_f_initCustomSecret f_initSec) 7195 { 7196 if (seed64 == 0) 7197 return XXH3_hashLong_128b_internal(input, len, 7198 XXH3_kSecret, sizeof(XXH3_kSecret), 7199 f_acc, f_scramble); 7200 { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; 7201 f_initSec(secret, seed64); 7202 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), 7203 f_acc, f_scramble); 7204 } 7205 } 7206 7207 /* 7208 * It's important for performance that XXH3_hashLong is not inlined. 7209 */ 7210 XXH_NO_INLINE XXH128_hash_t 7211 XXH3_hashLong_128b_withSeed(const void* input, size_t len, 7212 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) 7213 { 7214 (void)secret; (void)secretLen; 7215 return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, 7216 XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); 7217 } 7218 7219 typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, 7220 XXH64_hash_t, const void* XXH_RESTRICT, size_t); 7221 7222 XXH_FORCE_INLINE XXH128_hash_t 7223 XXH3_128bits_internal(const void* input, size_t len, 7224 XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, 7225 XXH3_hashLong128_f f_hl128) 7226 { 7227 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); 7228 /* 7229 * If an action is to be taken if `secret` conditions are not respected, 7230 * it should be done here. 7231 * For now, it's a contract pre-condition. 7232 * Adding a check and a branch here would cost performance at every hash. 7233 */ 7234 if (len <= 16) 7235 return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); 7236 if (len <= 128) 7237 return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); 7238 if (len <= XXH3_MIDSIZE_MAX) 7239 return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); 7240 return f_hl128(input, len, seed64, secret, secretLen); 7241 } 7242 7243 7244 /* === Public XXH128 API === */ 7245 7246 /*! @ingroup XXH3_family */ 7247 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) 7248 { 7249 return XXH3_128bits_internal(input, len, 0, 7250 XXH3_kSecret, sizeof(XXH3_kSecret), 7251 XXH3_hashLong_128b_default); 7252 } 7253 7254 /*! @ingroup XXH3_family */ 7255 XXH_PUBLIC_API XXH128_hash_t 7256 XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) 7257 { 7258 return XXH3_128bits_internal(input, len, 0, 7259 (const xxh_u8*)secret, secretSize, 7260 XXH3_hashLong_128b_withSecret); 7261 } 7262 7263 /*! @ingroup XXH3_family */ 7264 XXH_PUBLIC_API XXH128_hash_t 7265 XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) 7266 { 7267 return XXH3_128bits_internal(input, len, seed, 7268 XXH3_kSecret, sizeof(XXH3_kSecret), 7269 XXH3_hashLong_128b_withSeed); 7270 } 7271 7272 /*! @ingroup XXH3_family */ 7273 XXH_PUBLIC_API XXH128_hash_t 7274 XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) 7275 { 7276 if (len <= XXH3_MIDSIZE_MAX) 7277 return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); 7278 return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); 7279 } 7280 7281 /*! @ingroup XXH3_family */ 7282 XXH_PUBLIC_API XXH128_hash_t 7283 XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) 7284 { 7285 return XXH3_128bits_withSeed(input, len, seed); 7286 } 7287 7288 7289 /* === XXH3 128-bit streaming === */ 7290 #ifndef XXH_NO_STREAM 7291 /* 7292 * All initialization and update functions are identical to 64-bit streaming variant. 7293 * The only difference is the finalization routine. 7294 */ 7295 7296 /*! @ingroup XXH3_family */ 7297 XXH_PUBLIC_API XXH_errorcode 7298 XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) 7299 { 7300 return XXH3_64bits_reset(statePtr); 7301 } 7302 7303 /*! @ingroup XXH3_family */ 7304 XXH_PUBLIC_API XXH_errorcode 7305 XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) 7306 { 7307 return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); 7308 } 7309 7310 /*! @ingroup XXH3_family */ 7311 XXH_PUBLIC_API XXH_errorcode 7312 XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) 7313 { 7314 return XXH3_64bits_reset_withSeed(statePtr, seed); 7315 } 7316 7317 /*! @ingroup XXH3_family */ 7318 XXH_PUBLIC_API XXH_errorcode 7319 XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) 7320 { 7321 return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); 7322 } 7323 7324 /*! @ingroup XXH3_family */ 7325 XXH_PUBLIC_API XXH_errorcode 7326 XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) 7327 { 7328 return XXH3_update_regular(state, input, len); 7329 } 7330 7331 /*! @ingroup XXH3_family */ 7332 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) 7333 { 7334 const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; 7335 if (state->totalLen > XXH3_MIDSIZE_MAX) { 7336 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; 7337 XXH3_digest_long(acc, state, secret); 7338 XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); 7339 return XXH3_finalizeLong_128b(acc, secret, state->secretLimit + XXH_STRIPE_LEN, (xxh_u64)state->totalLen); 7340 } 7341 /* len <= XXH3_MIDSIZE_MAX : short code */ 7342 if (state->useSeed) 7343 return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); 7344 return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), 7345 secret, state->secretLimit + XXH_STRIPE_LEN); 7346 } 7347 #endif /* !XXH_NO_STREAM */ 7348 /* 128-bit utility functions */ 7349 7350 /* return : 1 is equal, 0 if different */ 7351 /*! @ingroup XXH3_family */ 7352 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) 7353 { 7354 /* note : XXH128_hash_t is compact, it has no padding byte */ 7355 return !(XXH_memcmp(&h1, &h2, sizeof(h1))); 7356 } 7357 7358 /* This prototype is compatible with stdlib's qsort(). 7359 * @return : >0 if *h128_1 > *h128_2 7360 * <0 if *h128_1 < *h128_2 7361 * =0 if *h128_1 == *h128_2 */ 7362 /*! @ingroup XXH3_family */ 7363 XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) 7364 { 7365 XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; 7366 XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; 7367 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); 7368 /* note : bets that, in most cases, hash values are different */ 7369 if (hcmp) return hcmp; 7370 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); 7371 } 7372 7373 7374 /*====== Canonical representation ======*/ 7375 /*! @ingroup XXH3_family */ 7376 XXH_PUBLIC_API void 7377 XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) 7378 { 7379 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); 7380 if (XXH_CPU_LITTLE_ENDIAN) { 7381 hash.high64 = XXH_swap64(hash.high64); 7382 hash.low64 = XXH_swap64(hash.low64); 7383 } 7384 XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); 7385 XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); 7386 } 7387 7388 /*! @ingroup XXH3_family */ 7389 XXH_PUBLIC_API XXH128_hash_t 7390 XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) 7391 { 7392 XXH128_hash_t h; 7393 h.high64 = XXH_readBE64(src); 7394 h.low64 = XXH_readBE64(src->digest + 8); 7395 return h; 7396 } 7397 7398 7399 7400 /* ========================================== 7401 * Secret generators 7402 * ========================================== 7403 */ 7404 #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) 7405 7406 XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) 7407 { 7408 XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); 7409 XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); 7410 } 7411 7412 /*! @ingroup XXH3_family */ 7413 XXH_PUBLIC_API XXH_errorcode 7414 XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) 7415 { 7416 #if (XXH_DEBUGLEVEL >= 1) 7417 XXH_ASSERT(secretBuffer != NULL); 7418 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); 7419 #else 7420 /* production mode, assert() are disabled */ 7421 if (secretBuffer == NULL) return XXH_ERROR; 7422 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; 7423 #endif 7424 7425 if (customSeedSize == 0) { 7426 customSeed = XXH3_kSecret; 7427 customSeedSize = XXH_SECRET_DEFAULT_SIZE; 7428 } 7429 #if (XXH_DEBUGLEVEL >= 1) 7430 XXH_ASSERT(customSeed != NULL); 7431 #else 7432 if (customSeed == NULL) return XXH_ERROR; 7433 #endif 7434 7435 /* Fill secretBuffer with a copy of customSeed - repeat as needed */ 7436 { size_t pos = 0; 7437 while (pos < secretSize) { 7438 size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); 7439 XXH_memcpy((char*)secretBuffer + pos, customSeed, toCopy); 7440 pos += toCopy; 7441 } } 7442 7443 { size_t const nbSeg16 = secretSize / 16; 7444 size_t n; 7445 XXH128_canonical_t scrambler; 7446 XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); 7447 for (n=0; n<nbSeg16; n++) { 7448 XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n); 7449 XXH3_combine16((char*)secretBuffer + n*16, h128); 7450 } 7451 /* last segment */ 7452 XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler)); 7453 } 7454 return XXH_OK; 7455 } 7456 7457 /*! @ingroup XXH3_family */ 7458 XXH_PUBLIC_API void 7459 XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed) 7460 { 7461 XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; 7462 XXH3_initCustomSecret(secret, seed); 7463 XXH_ASSERT(secretBuffer != NULL); 7464 XXH_memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE); 7465 } 7466 7467 7468 7469 /* Pop our optimization override from above */ 7470 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ 7471 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ 7472 && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ 7473 # pragma GCC pop_options 7474 #endif 7475 7476 #endif /* XXH_NO_LONG_LONG */ 7477 7478 #endif /* XXH_NO_XXH3 */ 7479 7480 /*! 7481 * @} 7482 */ 7483 #endif /* XXH_IMPLEMENTATION */ 7484 7485 7486 #if defined (__cplusplus) && !defined(XXH_NO_EXTERNC_GUARD) 7487 } /* extern "C" */ 7488 #endif