Just now I was reading about String Hashing, and I read the warning saying that it is best to generate the base for polynomial hashing randomly during contests where there is open hacking, such as div. 2 educational rounds on CodeForces. But what about USACO? Is it okay if I just pick a base such as, say 2^29 - 1 (a mersenne prime thats ~500 million) and a modulo such as, say, 2^59 - 1 (also a mersenne prime)? Or are there test cases specifically designed to hack certain bases/modulos that are arbitrarily large?
Thanks. But for some reason, B=2^31-1 and M=2^61-1 failed on Bovine Genomics on test cases 3,4, and 8 (?). However, B=2^29-1 and B=1,000,000,007 passed (M=2^61-1)? Note that test case 3 has very light constraints (N=25, M=50).
#include <bits/stdc++.h>
using namespace std;
struct GenomeHasher {
long long base = (1LL << 31) - 1, modulo = (1LL << 61) - 1;
vector<long long> power_modulo = {1};
vector<long long> prefix_hashes = {0};
GenomeHasher(const string &genome) {
for (char character: genome) {
power_modulo.push_back((__int128) power_modulo.back() * base % modulo);
int value;
switch (character) {
case 'A':
value = 0;
break;
case 'C':
value = 1;
break;
case 'G':
value = 2;
break;
case 'T':
value = 3;
break;
default: // T
assert(false);
}
prefix_hashes.push_back(((__int128) prefix_hashes.back() * base + value) % modulo);
}
}
long long hash(int start, int end) {
if (end < start) return 0;
return ((__int128) prefix_hashes[end] - (__int128) prefix_hashes[start - 1] * power_modulo[end - start + 1]) % modulo;
}
};
int main() {
ios_base::sync_with_stdio(false);
cin.tie(nullptr);
freopen("cownomics.in", "r", stdin);
freopen("cownomics.out", "w", stdout);
int amount, length;
cin >> amount >> length;
vector<GenomeHasher> spotty_hashers, plain_hashers;
for (int index = 0; index < amount; index++) {
string genome;
cin >> genome;
spotty_hashers.push_back(GenomeHasher(genome));
}
for (int index = 0; index < amount; index++) {
string genome;
cin >> genome;
plain_hashers.push_back(GenomeHasher(genome));
}
function check = [&](int sequence_length) {
for (int start = 1; start <= length - sequence_length + 1; start++) {
unordered_set<long long> spotty_hashes;
for (GenomeHasher spotty_hasher: spotty_hashers) {
spotty_hashes.insert(spotty_hasher.hash(start, start + sequence_length - 1));
}
bool works = true;
for (GenomeHasher plain_hasher: plain_hashers) {
if (spotty_hashes.count(plain_hasher.hash(start, start + sequence_length - 1))) {
works = false;
break;
}
}
if (works) {
return true;
}
}
return false;
};
int left = 1, right = length;
while (left < right) {
int middle = (left + right) / 2;
if (check(middle)) {
right = middle;
}
else {
left = middle + 1;
}
}
cout << left << "\n";
return 0;
}
Either your solution is wrong (it’s strange to why your hash function might return a negative number) and/or that specific combination of B / M is vulnerable to collisions. Let me look into it further …
If you pick B uniformly at random from the range 0 … M - 1 then you shouldn’t have any issues.
I looked into the failing test case and it looks like CGAA and AACA collide. This happens because B^2+2B-1\equiv 0\pmod M.
But if you select B uniformly at random from the range [0, M-1], the probability of this polynomial being zero is only 2/M since it only has two roots modulo M.