@@ -924,174 +924,7 @@ static inline uint32_t shift_crc(uint32_t shift_table[][256], uint32_t crc)
         ^ shift_table[3][crc >> 24];
 }
 
-/* Compute CRC-32C using the Intel hardware instruction. */
-uint32_t crc32c_append_hw(uint32_t crc, buffer buf, size_t len)
-{
-    buffer next = buf;
-    buffer end;
-#ifdef _M_X64
-    uint64_t crc0, crc1, crc2;  /* need to be 64 bits for crc32q */
-#else
-    uint32_t crc0, crc1, crc2;
-#endif
-
-    /* pre-process the crc */
-    crc0 = crc ^ 0xffffffff;
-
-    /* compute the crc for up to seven leading bytes to bring the data pointer
-       to an eight-byte boundary */
-    while (len && ((uintptr_t)next & 7) != 0)
-    {
-        crc0 = _mm_crc32_u8((uint32_t)(crc0), *next);
-        ++next;
-        --len;
-    }
-
-#ifdef _M_X64
-    /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
-       instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
-       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
-       throughput of one crc per cycle, but a latency of three cycles */
-    while (len >= 3 * LONG_SHIFT)
-    {
-        crc1 = 0;
-        crc2 = 0;
-        end = next + LONG_SHIFT;
-        do
-        {
-            crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)(next));
-            crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(next + LONG_SHIFT));
-            crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(next + 2 * LONG_SHIFT));
-            next += 8;
-        } while (next < end);
-        crc0 = shift_crc(long_shifts.dword_table, (uint32_t)(crc0)) ^ crc1;
-        crc0 = shift_crc(long_shifts.dword_table, (uint32_t)(crc0)) ^ crc2;
-        next += 2 * LONG_SHIFT;
-        len -= 3 * LONG_SHIFT;
-    }
-
-    /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
-       than a LONG_SHIFT*3 block */
-    while (len >= 3 * SHORT_SHIFT)
-    {
-        crc1 = 0;
-        crc2 = 0;
-        end = next + SHORT_SHIFT;
-        do
-        {
-            crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)(next));
-            crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(next + SHORT_SHIFT));
-            crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(next + 2 * SHORT_SHIFT));
-            next += 8;
-        } while (next < end);
-        crc0 = shift_crc(short_shifts.dword_table, (uint32_t)(crc0)) ^ crc1;
-        crc0 = shift_crc(short_shifts.dword_table, (uint32_t)(crc0)) ^ crc2;
-        next += 2 * SHORT_SHIFT;
-        len -= 3 * SHORT_SHIFT;
-    }
-
-    /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
-       block */
-    end = next + (len - (len & 7));
-    while (next < end)
-    {
-        crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)(next));
-        next += 8;
-    }
-#else
-    /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
-       instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
-       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
-       throughput of one crc per cycle, but a latency of three cycles */
-    while (len >= 3 * LONG_SHIFT)
-    {
-        crc1 = 0;
-        crc2 = 0;
-        end = next + LONG_SHIFT;
-        do
-        {
-            crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)(next));
-            crc1 = _mm_crc32_u32(crc1, *(const uint32_t *)(next + LONG_SHIFT));
-            crc2 = _mm_crc32_u32(crc2, *(const uint32_t *)(next + 2 * LONG_SHIFT));
-            next += 4;
-        } while (next < end);
-        crc0 = shift_crc(long_shifts.dword_table, (uint32_t)(crc0)) ^ crc1;
-        crc0 = shift_crc(long_shifts.dword_table, (uint32_t)(crc0)) ^ crc2;
-        next += 2 * LONG_SHIFT;
-        len -= 3 * LONG_SHIFT;
-    }
-
-    /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
-       than a LONG_SHIFT*3 block */
-    while (len >= 3 * SHORT_SHIFT)
-    {
-        crc1 = 0;
-        crc2 = 0;
-        end = next + SHORT_SHIFT;
-        do
-        {
-            crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)(next));
-            crc1 = _mm_crc32_u32(crc1, *(const uint32_t *)(next + SHORT_SHIFT));
-            crc2 = _mm_crc32_u32(crc2, *(const uint32_t *)(next + 2 * SHORT_SHIFT));
-            next += 4;
-        } while (next < end);
-        crc0 = shift_crc(short_shifts.dword_table, (uint32_t)(crc0)) ^ crc1;
-        crc0 = shift_crc(short_shifts.dword_table, (uint32_t)(crc0)) ^ crc2;
-        next += 2 * SHORT_SHIFT;
-        len -= 3 * SHORT_SHIFT;
-    }
-
-    /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
-       block */
-    end = next + (len - (len & 7));
-    while (next < end)
-    {
-        crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)(next));
-        next += 4;
-    }
-#endif
-    len &= 7;
-
-    /* compute the crc for up to seven trailing bytes */
-    while (len)
-    {
-        crc0 = _mm_crc32_u8((uint32_t)(crc0), *next);
-        ++next;
-        --len;
-    }
-
-    /* return a post-processed crc */
-    return (uint32_t)(crc0) ^ 0xffffffff;
-}
-
-int crc32c_hw_available()
-{
-    int info[4];
-#ifdef CRC32C_GCC
-    __cpuid(1, info[0], info[1], info[2], info[3]);
-#else
-    __cpuid(info, 1);
-#endif
-    return (info[2] & (1 << 20)) != 0;
-}
-
-uint32_t (*append_func)(uint32_t, buffer, size_t)
-#ifdef __cplusplus
-    = crc32c_hw_available() ? crc32c_append_hw : crc32c_append_sw;
-#else
-    = crc32c_append_sw;
-#endif
-
-#ifndef __cplusplus
-void crc32c_init()
-{
-    if (crc32c_hw_available()) {
-        append_func = crc32c_append_hw;
-    }
-}
-#endif
-
 uint32_t crc32c_append(uint32_t crc, buffer input, size_t length)
 {
-    return append_func(crc, input, length);
+    return crc32c_append_sw(crc, input, length);
 }
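
For context, below is a minimal sketch (not part of this commit) of the single-stream form of the hardware loop removed above: pre- and post-invert the CRC, fold eight bytes at a time through _mm_crc32_u64, and finish the tail byte-wise with _mm_crc32_u8. It assumes an x86-64 compiler with SSE4.2 enabled (e.g. -msse4.2); the function name is illustrative, and it omits the three-way interleaving and shift tables the removed crc32c_append_hw used.

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>  /* SSE4.2 CRC32 intrinsics */

/* Illustrative single-stream CRC-32C; not the removed three-stream code. */
static uint32_t crc32c_hw_single(uint32_t crc, const unsigned char *buf, size_t len)
{
    uint64_t c = crc ^ 0xffffffff;  /* pre-process the crc */
    while (len >= 8)
    {
        uint64_t word;
        memcpy(&word, buf, 8);      /* safe unaligned eight-byte load */
        c = _mm_crc32_u64(c, word); /* crc32q: eight bytes per step */
        buf += 8;
        len -= 8;
    }
    while (len--)                   /* up to seven trailing bytes */
        c = _mm_crc32_u8((uint32_t)c, *buf++);
    return (uint32_t)c ^ 0xffffffff; /* post-process the crc */
}

The removed code ran three such streams in parallel over LONG_SHIFT- and SHORT_SHIFT-byte blocks and merged them with shift_crc, hiding the three-cycle latency of the crc32 instruction behind its one-per-cycle throughput.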