/* You can compile this source by writing g++ -march=native main.cpp */ #include #include #include #include typedef unsigned int u32; typedef unsigned long long u64; char lowercase(int c) { return (c >= 'A' && c <= 'Z') ? c|32 : c; } bool InsensitiveCompare(const char*haystack, const char* needle, int len) { for(int i=0; i&results, const char *needle, int needleLen, const char*file, int len) { assert(needleLen>3); assert(!(len & 31)); const auto firstLetter = _mm256_set1_epi8(needle[0]); const auto secondLetter = _mm256_set1_epi8(needle[1]); const auto thirdLetter = _mm256_set1_epi8(needle[2]); const auto addAmount = _mm256_set1_epi8(37); const auto Val101 = _mm256_set1_epi8(101); const auto toLowerValue = _mm256_set1_epi8(32); auto letters = _mm256_load_si256((__m256i*)file); auto upperMask = _mm256_cmpgt_epi8(_mm256_add_epi8(letters, addAmount), Val101); auto lowercaseLetters = _mm256_add_epi8(letters, _mm256_and_si256(upperMask, toLowerValue)); auto fistMask = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(lowercaseLetters, firstLetter)); auto secondMask = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(lowercaseLetters, secondLetter)); auto thirdMask = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(lowercaseLetters, thirdLetter)); //We should use a while statement and carry bits from the previous loop to the next //but the code was hard to read so this example only checks one 32byte buffer u32 bits = (fistMask<<2) & (secondMask<<1) & (thirdMask); while (bits) { int i = _tzcnt_u32(bits); if (InsensitiveCompare(&file[i+1], &needle[3], needleLen-3)) { results.push_back(i-2); auto ignoreBits = (1ULL<<(i+needleLen))-1; bits &= (~ignoreBits); continue; } bits &= bits-1; } } int main() { char buf[256]; char*p = (char*)(u64(buf+31) & -32); //Have p be 32byte aligned memcpy(p, "12ABCDEABCDEF", 13); std::vector results; FindInsensitiveAligned(results, "abcde", 5, buf, 32); assert(results.size() == 2 && results[0] == 2 && results[1] == 7); }