Benq 2D offline BIT explanation?

Benq’s 2D offline BIT is mentioned here. Can someone provide an explanation to his code?

Here’s a cleaner solution to DMOPC '19 Contest 7 P5 - Soriya's Programming Project - DMOJ: Modern Online Judge that uses it:

#include<bits/stdc++.h>
using namespace std;

template<class T> using V = vector<T>;
using pi = pair<int,int>;

template<class T, int SZ> class BIT {
	T bit[SZ]{};
public:
	void update(int x, T t) {
		for (; x < SZ; x += x&-x)
			bit[x] += t;
	}
	T query(int x) {
		T res = 0;
		for (; x; x -= x&-x)
			res += bit[x];
		return res;
	}
};

/**
 * The following data structure supports 2D range update + query, where the points to be updated
   are known beforehand.
 * The idea is to store a separate 1D BIT for each x. The x-th 1D BIT supports operations
   - update bit[y] += t where y must be one of all_y[y_start[x] ... y_start[x] + y_cnt[x]-1]
   - query sum_{y' <= y} bit[y'] for any y
 * Memory: O(U log SZ) where U is the number of updates
 * Time: O(log U log SZ) per update or query
 */

template<class T, int SZ> class OffBIT2D {
	bool initialized = 0;
	V<int> all_y;
	int y_cnt[SZ]{};
	int y_start[SZ]{};
	V<T> bit;
	int rank_of_y(int x, int y) { // get rank of y in distinct y-values for x
		int l = y_start[x], r = y_start[x]+y_cnt[x];
		return upper_bound(begin(all_y)+l,begin(all_y)+r,y)-begin(all_y)-l;
	}
public:
	// all updates {x,y} must satisfy x in (0, SZ), no restrictions on y
	void init(V<pair<int,int>> updates) {
		assert(!initialized);
		initialized = true;
		sort(begin(updates), end(updates),[](const pi& a, const pi& b) { 
			return a.second < b.second; 
		});
		for (auto [x, y]: updates) // push to log(N) different BITs
			for (; x < SZ; x += x&-x)
				++y_cnt[x];
		int total_size = 0;
		for (int i = 0; i < SZ; ++i)
			y_start[i] = (total_size += y_cnt[i]);
		all_y.resize(total_size), bit.resize(total_size);
		reverse(begin(updates), end(updates));
		for (auto [x, y]: updates)
			for (; x < SZ; x += x&-x)
				all_y[--y_start[x]] = y;
	}
	void update(int x, int _y, T t) {
		assert(initialized);
		for (; x < SZ; x += x&-x)
			for (int y = rank_of_y(x,_y); y <= y_cnt[x]; y += y&-y) 
				bit[y_start[x]+y-1] += t;
	}
	T query(int x, int _y) {
		assert(initialized);
		T res = 0;
		for (; x; x -= x&-x)
			for (int y = rank_of_y(x,_y); y; y -= y&-y)
				res += bit[y_start[x]+y-1];
		return res;
	}
};


/**
 * Description: A set (not multiset!) with support for finding the $n$'th
 * element, and finding the index of an element. Change \texttt{null\_type} for map.
 * Time: O(\log N)
 * Source: KACTL
   * https://codeforces.com/blog/entry/11080
 * Verification: many
 */

#include <ext/pb_ds/assoc_container.hpp>
using namespace __gnu_pbds;
template<class T> using Tree = tree<T, null_type, less<T>, 
	rb_tree_tag, tree_order_statistics_node_update>; 
#define ook order_of_key
#define fbo find_by_order

int main() {
	ios::sync_with_stdio(false);
	cin.tie(nullptr);
	int N; cin >> N;

	V<int> A(N);
	for (int& i: A) cin >> i;

	// make A a permutation
	V<int> A_idx(N); iota(begin(A_idx), end(A_idx), 0);
	stable_sort(begin(A_idx), end(A_idx),[&](int x, int y) {
		return A[x] < A[y];
	});
	for (int i = 0; i < N; ++i)
		A[A_idx[i]] = i+1;

	vector<int> P(N);
	for (int& i: P) cin >> i;

	OffBIT2D<int,500005> O;
	V<pi> updates(N);
	for (int i = 0; i < N; ++i) {
		const int x = A[P[i]-1];
		updates[i] = {P[i],x};
	}
	O.init(updates);

	BIT<int, 500005> t1, t2;
	int64_t ans = 0;
	for (int i = 0; i < N; ++i) {
		const int x = A[P[i]-1];
		const int lower_left = O.query(P[i], x);
		O.update(P[i], x, 1);
		ans += t1.query(P[i])-lower_left;
		ans += t2.query(x)-lower_left;
		t1.update(P[i], 1), t2.update(x, 1);
		cout << ans << "\n";
	}
}

This version (which should be easier to understand) uses arrays of vectors so it ends up using nearly twice the memory of the previous code (but is nearly as fast):

#include<bits/stdc++.h>
using namespace std;

template<class T> using V = vector<T>;
using pi = pair<int,int>;

template<class T, int SZ> class BIT {
	T bit[SZ]{};
public:
	void update(int x, T t) {
		for (; x < SZ; x += x&-x)
			bit[x] += t;
	}
	T query(int x) {
		T res = 0;
		for (; x; x -= x&-x)
			res += bit[x];
		return res;
	}
};

/**
 * The following data structure supports 2D range update + query, where the points to be updated
   are known beforehand.
 * The idea is to store a separate 1D BIT for each x. The x-th 1D BIT supports operations
   - update bit[y] += t where y must be one of all_y[y_start[x] ... y_start[x] + y_cnt[x]-1]
   - query sum_{y' <= y} bit[y'] for any y
 * Memory: O(U log SZ) where U is the number of updates
 * Time: O(log U log SZ) per update or query
 */

template<class T, int SZ> class OffBIT2D {
	bool initialized = 0;
	// V<int> all_y;
	vector<int> all_y[SZ];
	// int y_cnt[SZ]{};
	// int y_start[SZ]{};
	// V<T> bit;
	V<T> bit[SZ];
	int rank_of_y(int x, int y) { // get rank of y in distinct y-values for x
		// int l = y_start[x], r = y_start[x]+y_cnt[x];
		// return upper_bound(begin(all_y)+l,begin(all_y)+r,y)-begin(all_y)-l;
		return upper_bound(begin(all_y[x]),end(all_y[x]), y)-begin(all_y[x]);
	}
public:
	// all updates {x,y} must satisfy x in (0, SZ), no restrictions on y
	void init(V<pair<int,int>> updates) {
		assert(!initialized);
		initialized = true;
		sort(begin(updates), end(updates),[](const pi& a, const pi& b) { 
			return a.second < b.second; 
		});
		vector<int> y_cnt(SZ);
		for (auto [x, y]: updates) // push to log(N) different BITs
			for (; x < SZ; x += x&-x) {
				++y_cnt[x];
			}
		for (int x = 0; x < SZ; ++x) {
			all_y[x].reserve(y_cnt[x]);
			bit[x].resize(y_cnt[x]);
		}
		for (auto [x, y]: updates) {
			for (; x < SZ; x += x&-x) {
				all_y[x].push_back(y);
			}
		}
	}
	void update(int x, int _y, T t) {
		assert(initialized);
		for (; x < SZ; x += x&-x)
			for (int y = rank_of_y(x,_y); y <= size(all_y[x]); y += y&-y) 
				bit[x][y-1] += t;
	}
	T query(int x, int _y) {
		assert(initialized);
		T res = 0;
		for (; x; x -= x&-x)
			for (int y = rank_of_y(x,_y); y; y -= y&-y)
				res += bit[x][y-1];
		return res;
	}
};


/**
 * Description: A set (not multiset!) with support for finding the $n$'th
 * element, and finding the index of an element. Change \texttt{null\_type} for map.
 * Time: O(\log N)
 * Source: KACTL
   * https://codeforces.com/blog/entry/11080
 * Verification: many
 */

#include <ext/pb_ds/assoc_container.hpp>
using namespace __gnu_pbds;
template<class T> using Tree = tree<T, null_type, less<T>, 
	rb_tree_tag, tree_order_statistics_node_update>; 
#define ook order_of_key
#define fbo find_by_order

int main() {
	ios::sync_with_stdio(false);
	cin.tie(nullptr);
	int N; cin >> N;

	V<int> A(N);
	for (int& i: A) cin >> i;

	// make A a permutation
	V<int> A_idx(N); iota(begin(A_idx), end(A_idx), 0);
	stable_sort(begin(A_idx), end(A_idx),[&](int x, int y) {
		return A[x] < A[y];
	});
	for (int i = 0; i < N; ++i)
		A[A_idx[i]] = i+1;

	vector<int> P(N);
	for (int& i: P) cin >> i;

	OffBIT2D<int,500005> O;
	V<pi> updates(N);
	for (int i = 0; i < N; ++i) {
		const int x = A[P[i]-1];
		updates[i] = {P[i],x};
	}
	O.init(updates);

	BIT<int, 500005> t1, t2;
	int64_t ans = 0;
	for (int i = 0; i < N; ++i) {
		const int x = A[P[i]-1];
		const int lower_left = O.query(P[i], x);
		O.update(P[i], x, 1);
		ans += t1.query(P[i])-lower_left;
		ans += t2.query(x)-lower_left;
		t1.update(P[i], 1), t2.update(x, 1);
		cout << ans << "\n";
	}
}
1 Like