Factor out and unit test the parts of the DHT routing table logic that handle the affinity of nodes to insert. Specifically, make sure the even distribution of nodes within routing table buckets works correctly.

2019-07-27 14:20:33 -07:00 · 2019-07-27 14:20:33 -07:00 · b1b03a0d77
parent c5d0ea6332
commit b1b03a0d77
3 changed files with 406 additions and 220 deletions
--- a/include/libtorrent/kademlia/routing_table.hpp
+++ b/include/libtorrent/kademlia/routing_table.hpp
@ -80,7 +80,7 @@ struct ipv6_hash
 	}
 };

-struct ip_set
+struct TORRENT_EXTRA_EXPORT ip_set
 {
 	void insert(address const& addr);
 	bool exists(address const& addr) const;
@ -97,12 +97,23 @@ struct ip_set
 		return m_ip4s == rh.m_ip4s && m_ip6s == rh.m_ip6s;
 	}

+	std::size_t size() const { return m_ip4s.size() + m_ip6s.size(); }
+
 	// these must be multisets because there can be multiple routing table
 	// entries for a single IP when restrict_routing_ips is set to false
 	std::unordered_multiset<address_v4::bytes_type, ipv4_hash> m_ip4s;
 	std::unordered_multiset<address_v6::bytes_type, ipv6_hash> m_ip6s;
 };

+// Each routing table bucket represents node IDs with a certain number of bits
+// of prefix in common with our own node ID. Each bucket fits 8 nodes (and
+// sometimes more, closer to the top). In order to minimize the number of hops
+// necessary to traverse the DHT, we want the nodes in our buckets to be spread
+// out across all possible "sub-branches". This is what "classify" refers to:
+// the 3 (or more) bits following the shared bit prefix select the sub-branch.
+TORRENT_EXTRA_EXPORT std::uint8_t  classify_prefix(int bucket_idx, bool last_bucket
+	, int bucket_size, node_id nid);
+
 // differences in the implementation from the description in
 // the paper:
 //
@ -308,6 +319,14 @@ private:
 	int const m_bucket_size;
 };

+TORRENT_EXTRA_EXPORT routing_table::add_node_status_t
+replace_node_impl(node_entry const& e, bucket_t& b, ip_set& ips
+	, int bucket_index, int bucket_size_limit, bool last_bucket
+#ifndef TORRENT_DISABLE_LOGGING
+	, dht_logger* log
+#endif
+	);
+
 } } // namespace libtorrent::dht

 #endif // ROUTING_TABLE_HPP
--- a/src/kademlia/routing_table.cpp
+++ b/src/kademlia/routing_table.cpp
@ -108,7 +108,139 @@ bool mostly_verified_nodes(bucket_t const& b)
 	return num_verified >= static_cast<int>(b.size()) * 2 / 3;
 }

-routing_table::routing_table(node_id const& id, udp proto, int bucket_size
+std::uint8_t classify_prefix(int const bucket_idx, bool const last_bucket
+	, int const bucket_size, node_id nid)
+{ // maps nid to the index, in [0, bucket_size), of its prefix slot within the bucket
+	TORRENT_ASSERT_VAL(bucket_size > 0, bucket_size);
+	TORRENT_ASSERT_VAL(bucket_size <= 256, bucket_size);
+	// bucket_size - 1 has every bit below the bucket size set, e.g. 8 -> 0b111
+	std::uint32_t mask = static_cast<std::uint32_t>(bucket_size) - 1;
+	// bucket sizes must be powers of two, i.e. share no bits with bucket_size - 1
+	TORRENT_ASSERT_VAL((mask & static_cast<std::uint32_t>(bucket_size)) == 0, bucket_size);
+	// NOTE(review): the asserts below need this shift to be in [0, 8) and to bring the mask's top bit to 0x80 - confirm aux::count_leading_zeros() yields that for a single 32 bit word
+	int const mask_shift = aux::count_leading_zeros(mask);
+	TORRENT_ASSERT_VAL(mask_shift >= 0, mask_shift);
+	TORRENT_ASSERT_VAL(mask_shift < 8, mask_shift);
+	mask <<= mask_shift; // move the mask up to the most significant bits of a byte
+	TORRENT_ASSERT_VAL(mask > 0, mask);
+	TORRENT_ASSERT_VAL(bool((mask & 0x80) != 0), mask);
+
+	// the reason to shift one bit extra (except for the last bucket) is that the
+	// first bit *defines* the bucket. That bit will be the same for all entries.
+	// We're not interested in that one. However, the last bucket hasn't split
+	// yet, so it will contain entries from both "sides", so we need to include
+	// the top bit.
+	nid <<= bucket_idx + int(!last_bucket);
+	std::uint8_t const ret = (nid[0] & mask) >> mask_shift; // the masked top bits of the first byte, as a small integer
+	TORRENT_ASSERT_VAL(ret < bucket_size, ret);
+	return ret;
+}
+
+routing_table::add_node_status_t replace_node_impl(node_entry const& e
+	, bucket_t& b, ip_set& ips, int const bucket_index
+	, int const bucket_size_limit, bool const last_bucket
+#ifndef TORRENT_DISABLE_LOGGING
+	, dht_logger* log
+#endif
+	)
+{
+	// tries to overwrite one node in the full bucket b with e, keeping ips in sync. Returns node_added if a node was replaced and need_bucket_split otherwise.
+	// if the bucket isn't full, we're not replacing anything, and this function should not have been called
+	TORRENT_ASSERT(int(b.size()) >= bucket_size_limit);
+	// first choice: the node that has failed the most times
+	bucket_t::iterator j = std::max_element(b.begin(), b.end()
+		, [](node_entry const& lhs, node_entry const& rhs)
+		{ return lhs.fail_count() < rhs.fail_count(); });
+	TORRENT_ASSERT(j != b.end());
+
+	if (j->fail_count() > 0)
+	{
+		// j points to a node that has been marked
+		// as stale. Replace it with this new one
+		ips.erase(j->addr());
+		*j = e;
+		ips.insert(e.addr());
+		return routing_table::node_added;
+	}
+
+	// then we look for nodes with the same 3 bit prefix (or however
+	// many bits prefix the bucket size warrants). If there is no other
+	// node with this prefix, remove the duplicate with the highest RTT.
+	// as the last replacement strategy, if the node we found matching our
+	// bit prefix has higher RTT than the new node, replace it.
+
+	// in order to provide as few lookups as possible before finding
+	// the data someone is looking for, make sure there is an affinity
+	// towards having a good spread of node IDs in each bucket
+	std::uint8_t const to_add_prefix = classify_prefix(bucket_index
+		, last_bucket, bucket_size_limit, e.id);
+	// NOTE(review): the storage below has 128 slots, while classify_prefix() asserts bucket sizes up to 256 - confirm bucket_size_limit never exceeds 128
+	// nodes organized by their prefix: nodes[p] holds iterators to every node in b whose prefix slot is p
+	aux::array<std::vector<bucket_t::iterator>, 128> nodes_storage;
+	auto const nodes = span<std::vector<bucket_t::iterator>>{nodes_storage}.first(bucket_size_limit);
+
+	for (j = b.begin(); j != b.end(); ++j)
+	{
+		std::uint8_t const prefix = classify_prefix(
+			bucket_index, last_bucket, bucket_size_limit, j->id);
+		TORRENT_ASSERT(prefix < nodes.size());
+		nodes[prefix].push_back(j);
+	}
+
+	if (!nodes[to_add_prefix].empty())
+	{
+		// the slot e belongs to is occupied: pick the node in it that orders last by node_entry's operator<
+		j = *std::max_element(nodes[to_add_prefix].begin(), nodes[to_add_prefix].end()
+			, [](bucket_t::iterator lhs, bucket_t::iterator rhs)
+			{ return *lhs < *rhs; });
+
+		// only if e is better than the worst node in this prefix slot do we
+		// replace it. resetting j means we're not replacing it
+		if (!(e < *j)) j = b.end();
+	}
+	else
+	{
+		// there is no node in this prefix slot. We definitely want to add it.
+		// Now we just need to figure out which one to replace
+		std::vector<bucket_t::iterator> replace_candidates;
+		for (auto const& n : nodes) // gather the nodes of every slot that holds more than one node
+		{
+			if (n.size() > 1) replace_candidates.insert(replace_candidates.end(), n.begin(), n.end());
+		}
+
+		// since the bucket is full, and there's no node in the prefix-slot
+		// we're about to add to, there must be at least one prefix slot that
+		// has more than one node.
+		TORRENT_ASSERT(!replace_candidates.empty());
+
+		// from these nodes, pick the "worst" one and replace it
+		j = *std::max_element(replace_candidates.begin(), replace_candidates.end()
+			, [](bucket_t::iterator lhs, bucket_t::iterator rhs)
+			{ return *lhs < *rhs; });
+	}
+
+	if (j != b.end())
+	{
+#ifndef TORRENT_DISABLE_LOGGING
+		if (log != nullptr && log->should_log(dht_logger::routing_table))
+		{
+			log->log(dht_logger::routing_table, "replacing node with better one: %s %s [%s %dms %d] vs. [%s %dms %d]"
+				, aux::to_hex(e.id).c_str(), print_address(e.addr()).c_str()
+				, e.verified ? "verified" : "not-verified", e.rtt
+				, classify_prefix(bucket_index, last_bucket, bucket_size_limit, e.id)
+				, j->verified ? "verified" : "not-verified", j->rtt
+				, classify_prefix(bucket_index, last_bucket, bucket_size_limit, j->id)
+				);
+		}
+#endif
+		ips.erase(j->addr());
+		*j = e;
+		ips.insert(e.addr());
+		return routing_table::node_added;
+	}
+	return routing_table::need_bucket_split; // nothing was replaced; b and ips are left untouched
+}
+
+routing_table::routing_table(node_id const& id, udp const proto, int const bucket_size
 	, dht::settings const& settings
 	, dht_logger* log)
 	:
@ -122,6 +254,8 @@ routing_table::routing_table(node_id const& id, udp proto, int bucket_size
 	, m_last_self_refresh(min_time())
 	, m_bucket_size(bucket_size)
 {
+	// bucket sizes must be a power of 2
+	TORRENT_ASSERT_VAL(((bucket_size - 1) & bucket_size) == 0, bucket_size);
 	TORRENT_UNUSED(log);
 	m_buckets.reserve(30);
 }
@ -369,6 +503,8 @@ node_entry* routing_table::find_node(udp::endpoint const& ep
 	return nullptr;
 }

+// TODO: this needs to take bucket "prefix" into account. It should be unified
+// with add_node_impl()
 void routing_table::fill_from_replacements(table_t::iterator bucket)
 {
 	bucket_t& b = bucket->live_nodes;
@ -379,9 +515,7 @@ void routing_table::fill_from_replacements(table_t::iterator bucket)

 	// sort by RTT first, to find the node with the lowest
 	// RTT that is pinged
-	std::sort(rb.begin(), rb.end()
-		, [](node_entry const& lhs, node_entry const& rhs)
-			{ return lhs.rtt < rhs.rtt; });
+	std::sort(rb.begin(), rb.end());

 	while (int(b.size()) < bucket_size && !rb.empty())
 	{
@ -588,7 +722,6 @@ routing_table::add_node_status_t routing_table::add_node_impl(node_entry e)
 	// long to split, and lose nodes (in the case where lower-numbered buckets
 	// are larger)
 	int const bucket_size_limit = bucket_limit(bucket_index);
-	int const next_bucket_size_limit = bucket_limit(bucket_index + 1);

 	bucket_t::iterator j;

@ -672,7 +805,7 @@ ip_ok:
 	// bucket's size limit. This makes us split the low-numbered buckets
 	// earlier when we have larger low buckets, to make it less likely that we
 	// lose nodes
-	if (e.pinged() && int(b.size()) < (can_split ? next_bucket_size_limit : bucket_size_limit))
+	if (e.pinged() && int(b.size()) < bucket_size_limit)
 	{
 		if (b.empty()) b.reserve(bucket_size_limit);
 		b.push_back(e);
@ -682,154 +815,23 @@ ip_ok:

 	// if there is no room, we look for nodes marked as stale
 	// in the k-bucket. If we find one, we can replace it.
-	// then we look for nodes with the same 3 bit prefix (or however
-	// many bits prefix the bucket size warrants). If there is no other
-	// node with this prefix, remove the duplicate with the highest RTT.
-	// as the last replacement strategy, if the node we found matching our
-	// bit prefix has higher RTT than the new node, replace it.
+
+	// A node is considered stale if it has failed at least one
+	// time. Here we choose the node that has failed most times.
+	// If we don't find one, place this node in the replacement-
+	// cache and replace any nodes that will fail in the future
+	// with nodes from that cache.
+
+	bool const last_bucket = bucket_index + 1 == int(m_buckets.size());

 	if (e.confirmed())
 	{
-		// A node is considered stale if it has failed at least one
-		// time. Here we choose the node that has failed most times.
-		// If we don't find one, place this node in the replacement-
-		// cache and replace any nodes that will fail in the future
-		// with nodes from that cache.
-
-		j = std::max_element(b.begin(), b.end()
-			, [](node_entry const& lhs, node_entry const& rhs)
-			{ return lhs.fail_count() < rhs.fail_count(); });
-		TORRENT_ASSERT(j != b.end());
-
-		if (j->fail_count() > 0)
-		{
-			// i points to a node that has been marked
-			// as stale. Replace it with this new one
-			m_ips.erase(j->addr());
-			*j = e;
-			m_ips.insert(e.addr());
-			return node_added;
-		}
-
-		// in order to provide as few lookups as possible before finding
-		// the data someone is looking for, make sure there is an affinity
-		// towards having a good spread of node IDs in each bucket
-
-		int mask = bucket_size_limit - 1;
-		int mask_shift = 0;
-		TORRENT_ASSERT_VAL(mask > 0, mask);
-		while ((mask & 0x80) == 0)
-		{
-			mask <<= 1;
-			++mask_shift;
-		}
-
-		// in case bucket_size_limit is not an even power of 2
-		mask = (0xff << mask_shift) & 0xff;
-
-		// pick out all nodes that have the same prefix as the new node
-		std::vector<bucket_t::iterator> nodes;
-		bool force_replace = false;
-
-		// the last bucket is special, since it hasn't been split yet, it
-		// includes that top bit as well
-		int const prefix_offset =
-			bucket_index + 1 == int(m_buckets.size()) ? bucket_index : bucket_index + 1;
-
-		{
-			node_id id = e.id;
-			id <<= prefix_offset;
-			int const candidate_prefix = id[0] & mask;
-
-			for (j = b.begin(); j != b.end(); ++j)
-			{
-				if (!matching_prefix(j->id, mask, candidate_prefix, prefix_offset)) continue;
-				nodes.push_back(j);
-			}
-		}
-
-		if (!nodes.empty())
-		{
-			j = *std::max_element(nodes.begin(), nodes.end()
-				, [](bucket_t::iterator lhs, bucket_t::iterator rhs)
-				{ return *lhs < *rhs; });
-		}
-		else
-		{
-			// there is no node in this prefix-slot, there may be some
-			// nodes sharing a prefix. Find all nodes that do not
-			// have a unique prefix
-
-			// find node entries with duplicate prefixes in O(1)
-			aux::vector<bucket_t::iterator> prefix(aux::numeric_cast<std::size_t>(int(1 << (8 - mask_shift))), b.end());
-			TORRENT_ASSERT(int(prefix.size()) >= bucket_size_limit);
-
-			// the begin iterator from this object is used as a placeholder
-			// for an occupied slot whose node has already been added to the
-			// duplicate nodes list.
-			bucket_t placeholder;
-
-			nodes.reserve(b.size());
-			for (j = b.begin(); j != b.end(); ++j)
-			{
-				node_id id = j->id;
-				id <<= prefix_offset;
-				int this_prefix = (id[0] & mask) >> mask_shift;
-				TORRENT_ASSERT(this_prefix >= 0);
-				TORRENT_ASSERT(this_prefix < int(prefix.size()));
-				if (prefix[this_prefix] != b.end())
-				{
-					// there's already a node with this prefix. Remember both
-					// duplicates.
-					nodes.push_back(j);
-
-					if (prefix[this_prefix] != placeholder.begin())
-					{
-						nodes.push_back(prefix[this_prefix]);
-						prefix[this_prefix] = placeholder.begin();
-					}
-				}
-			}
-
-			if (!nodes.empty())
-			{
-				// from these nodes, pick the one with the highest RTT
-				// and replace it
-
-				auto k = std::max_element(nodes.begin(), nodes.end()
-					, [](bucket_t::iterator lhs, bucket_t::iterator rhs)
-					{ return *lhs < *rhs; });
-
-				// in this case, we would really rather replace the node even if
-				// the new node has higher RTT, because it fills a new prefix that we otherwise
-				// don't have.
-				force_replace = true;
-				j = *k;
-			}
-			else
-			{
-				j = std::max_element(b.begin(), b.end());
-			}
-		}
-
-		if (j != b.end() && (force_replace || e < *j))
-		{
+		auto const ret = replace_node_impl(e, b, m_ips, bucket_index, bucket_size_limit, last_bucket
 #ifndef TORRENT_DISABLE_LOGGING
-			if (m_log != nullptr && m_log->should_log(dht_logger::routing_table))
-			{
-				m_log->log(dht_logger::routing_table, "replacing node with better one: %s %s %s %dms vs. %s %dms"
-					, aux::to_hex(e.id).c_str(), print_address(e.addr()).c_str()
-					, e.verified ? "verified" : "not-verified", e.rtt
-					, j->verified ? "verified" : "not-verified", j->rtt);
-			}
+			, m_log
 #endif
-			m_ips.erase(j->addr());
-			*j = e;
-			m_ips.insert(e.addr());
-			return node_added;
-		}
-		// in order to keep lookup times small, prefer nodes with low RTTs
-
+			);
+		if (ret != need_bucket_split) return ret;
 	}

 	// if we can't split, try to insert into the replacement bucket
@ -840,7 +842,6 @@ ip_ok:
 		// the bucket, and the bucket is full, we have to
 		// cache this node and wait until some node fails
 		// and then replace it.
-
 		j = std::find_if(rb.begin(), rb.end()
 			, [&e](node_entry const& ne) { return ne.id == e.id; });

@ -861,7 +862,15 @@ ip_ok:
 			// less reliable than this one, that has been pinged
 			j = std::find_if(rb.begin(), rb.end()
 				, [] (node_entry const& ne) { return !ne.pinged(); });
-			if (j == rb.end()) j = rb.begin();
+			if (j == rb.end())
+			{
+				auto const ret = replace_node_impl(e, rb, m_ips, bucket_index, m_bucket_size, last_bucket
+#ifndef TORRENT_DISABLE_LOGGING
+					, nullptr
+#endif
+					);
+				return ret == node_added ? node_added : failed_to_add;
+			}
 			m_ips.erase(j->addr());
 			rb.erase(j);
 		}
--- a/test/test_dht.cpp
+++ b/test/test_dht.cpp
@ -688,35 +688,25 @@ void print_state(std::ostream& os, routing_table const& table)
 			, int(i->replacements.size()));
 		if (cursor > int(buf.size()) - 500) buf.resize(buf.size() * 3 / 2);

-		int id_shift;
-		// the last bucket is special, since it hasn't been split yet, it
-		// includes that top bit as well
-		if (bucket_index + 1 == int(table.buckets().size()))
-			id_shift = bucket_index;
-		else
-			id_shift = bucket_index + 1;
+		bucket_t nodes = i->live_nodes;

-		for (bucket_t::const_iterator j = i->live_nodes.begin()
-			, end2(i->live_nodes.end()); j != end2; ++j)
+		std::sort(nodes.begin(), nodes.end()
+			, [](node_entry const& lhs, node_entry const& rhs)
+			{ return lhs.id < rhs.id; }
+		);
+
+		for (auto j = nodes.begin(); j != nodes.end(); ++j)
 		{
-			int bucket_size_limit = table.bucket_limit(bucket_index);
-			std::uint32_t top_mask = std::uint32_t(bucket_size_limit - 1);
-			int mask_shift = 0;
+			int const bucket_size_limit = table.bucket_limit(bucket_index);
+			TORRENT_ASSERT_VAL(bucket_size_limit <= 256, bucket_size_limit);
 			TORRENT_ASSERT_VAL(bucket_size_limit > 0, bucket_size_limit);
-			while ((top_mask & 0x80) == 0)
-			{
-				top_mask <<= 1;
-				++mask_shift;
-			}
-			top_mask = (0xff << mask_shift) & 0xff;

-			node_id id = j->id;
-			id <<= id_shift;
+			bool const last_bucket = bucket_index + 1 == int(table.buckets().size());
+			int const prefix = classify_prefix(bucket_index, last_bucket
+				, bucket_size_limit, j->id);

 			cursor += std::snprintf(BUFFER_CURSOR_POS
-				, " prefix: %2x id: %s"
-				, ((id[0] & top_mask) >> mask_shift)
-				, aux::to_hex(j->id).c_str());
+				, " prefix: %2x id: %s", prefix, aux::to_hex(j->id).c_str());

 			if (j->rtt == 0xffff)
 			{
@ -730,7 +720,7 @@ void print_state(std::ostream& os, routing_table const& table)
 			}

 			cursor += std::snprintf(BUFFER_CURSOR_POS
-				, " fail: %4d ping: %d dist: %3d"
+				, " fail: %3d ping: %d dist: %3d"
 				, j->fail_count()
 				, j->pinged()
 				, distance_exp(table.id(), j->id));
@ -758,52 +748,29 @@ void print_state(std::ostream& os, routing_table const& table)
 	for (auto i = table.buckets().begin(), end(table.buckets().end());
 		i != end; ++i, ++bucket_index)
 	{
-		int bucket_size_limit = table.bucket_limit(bucket_index);
-
-		// mask out the first 3 bits, or more depending
-		// on the bucket_size_limit
-		// we have all the lower bits set in (bucket_size_limit-1)
-		// but we want the left-most bits to be set. Shift it
-		// until the MSB is set
-		std::uint32_t top_mask = std::uint32_t(bucket_size_limit - 1);
-		int mask_shift = 0;
-		TORRENT_ASSERT_VAL(bucket_size_limit > 0, bucket_size_limit);
-		while ((top_mask & 0x80) == 0)
-		{
-			top_mask <<= 1;
-			++mask_shift;
-		}
-		top_mask = (0xff << mask_shift) & 0xff;
-		bucket_size_limit = int((top_mask >> mask_shift) + 1);
+		int const bucket_size_limit = table.bucket_limit(bucket_index);
 		TORRENT_ASSERT_VAL(bucket_size_limit <= 256, bucket_size_limit);
-		bool sub_buckets[256];
-		std::memset(sub_buckets, 0, sizeof(sub_buckets));
+		TORRENT_ASSERT_VAL(bucket_size_limit > 0, bucket_size_limit);
+		std::array<bool, 256> sub_buckets;
+		sub_buckets.fill(false);

-		int id_shift;
 		// the last bucket is special, since it hasn't been split yet, it
 		// includes that top bit as well
-		if (bucket_index + 1 == int(table.buckets().size()))
-			id_shift = bucket_index;
-		else
-			id_shift = bucket_index + 1;
+		bool const last_bucket = bucket_index + 1 == int(table.buckets().size());

-		for (bucket_t::const_iterator j = i->live_nodes.begin()
-			, end2(i->live_nodes.end()); j != end2; ++j)
+		for (auto const& e : i->live_nodes)
 		{
-			node_id id = j->id;
-			id <<= id_shift;
-			int b = (id[0] & top_mask) >> mask_shift;
-			TORRENT_ASSERT(b >= 0 && b < int(sizeof(sub_buckets)/sizeof(sub_buckets[0])));
-			sub_buckets[b] = true;
+			std::size_t const prefix = static_cast<std::size_t>(
+				classify_prefix(bucket_index, last_bucket, bucket_size_limit, e.id));
+			sub_buckets[prefix] = true;
 		}

-		cursor += std::snprintf(BUFFER_CURSOR_POS
-			, "%2d mask: %2x: [", bucket_index, (top_mask >> mask_shift));
+		cursor += std::snprintf(BUFFER_CURSOR_POS, "%2d: [", bucket_index);

 		for (int j = 0; j < bucket_size_limit; ++j)
 		{
 			cursor += std::snprintf(BUFFER_CURSOR_POS
-				, (sub_buckets[j] ? "X" : " "));
+				, (sub_buckets[static_cast<std::size_t>(j)] ? "X" : " "));
 		}
 		cursor += std::snprintf(BUFFER_CURSOR_POS
 			, "]\n");
@ -1581,6 +1548,8 @@ namespace {

 void test_routing_table(address(&rand_addr)())
 {
+	init_rand_address();
+
 	dht_test_setup t(udp::endpoint(rand_addr(), 20));
 	bdecode_node response;

@ -1589,7 +1558,7 @@ void test_routing_table(address(&rand_addr)())
 	s.extended_routing_table = false;
 	//	s.restrict_routing_ips = false;
 	node_id const nid = to_hash("3123456789abcdef01232456789abcdef0123456");
-	const int bucket_size = 10;
+	const int bucket_size = 8;
 	dht::routing_table table(nid, t.source.protocol(), bucket_size, s, &t.observer);
 	TEST_EQUAL(std::get<0>(table.size()), 0);

@ -1685,8 +1654,6 @@ void test_routing_table(address(&rand_addr)())

 	s.restrict_routing_ips = false;

-	init_rand_address();
-
 	{
 		auto const ep = rand_udp_ep(rand_addr);
 		auto const id = generate_id(ep.address());
@ -1694,16 +1661,16 @@ void test_routing_table(address(&rand_addr)())
 	}

 	nodes.clear();
-	for (int i = 0; i < 7000; ++i)
+	for (int i = 0; i < 10000; ++i)
 	{
 		auto const ep = rand_udp_ep(rand_addr);
 		auto const id = generate_id(ep.address());
 		table.node_seen(id, ep, 20 + (id[19] & 0xff));
 	}
 	std::printf("active buckets: %d\n", table.num_active_buckets());
-	TEST_CHECK(table.num_active_buckets() == 10
-		|| table.num_active_buckets() == 11);
-	TEST_CHECK(std::get<0>(table.size()) >= 10 * 10);
+	TEST_CHECK(table.num_active_buckets() == 11
+		|| table.num_active_buckets() == 12);
+	TEST_CHECK(std::get<0>(table.size()) >= bucket_size * 10);
 	//TODO: 2 test num_global_nodes
 	//TODO: 2 test need_refresh

@ -3032,7 +2999,7 @@ TORRENT_TEST(routing_table_uniform)
 	// 3: 16
 	// 4: 8
 	// i.e. no more than 5 levels
-	TEST_EQUAL(tbl.num_active_buckets(), 5);
+	TEST_EQUAL(tbl.num_active_buckets(), 6);

 	print_state(std::cout, tbl);
 }
@ -3538,7 +3505,7 @@ TORRENT_TEST(dht_verify_node_address)
 	dht::settings s;
 	s.extended_routing_table = false;
 	node_id id = to_hash("3123456789abcdef01232456789abcdef0123456");
-	const int bucket_size = 10;
+	const int bucket_size = 8;
 	dht::routing_table table(id, udp::v4(), bucket_size, s, &observer);
 	std::vector<node_entry> nodes;
 	TEST_EQUAL(std::get<0>(table.size()), 0);
@ -3562,7 +3529,7 @@ TORRENT_TEST(dht_verify_node_address)

 	// incorrect data, wrong id, should cause node to be removed
 	table.node_seen(to_hash("0123456789abcdef01232456789abcdef0123456")
-					, udp::endpoint(addr("4.4.4.4"), 4), 10);
+		, udp::endpoint(addr("4.4.4.4"), 4), 10);
 	table.find_node(id, nodes, 0, 10);

 	TEST_EQUAL(std::get<0>(table.size()), 0);
@ -3834,6 +3801,197 @@ TORRENT_TEST(mostly_verified_nodes)
 	TEST_CHECK(!mostly_verified_nodes({fake_node(false), fake_node(false), fake_node(false)}));
 }

+TORRENT_TEST(classify_prefix)
+{
+	// the last bucket in the routing table, bucket 0: the prefix is the top 3 bits of the ID
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("0cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 0);
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("2cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 1);
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("4cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 2);
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("6cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 3);
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("8cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 4);
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("acdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 5);
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("ccdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 6);
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("ecdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 7);
+	TEST_EQUAL(int(classify_prefix(0, true, 8, to_hash("fcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 7);
+	// last bucket, bucket 4: the first 4 bits (one hex digit) are skipped
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("c0cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 0);
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("c2cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 1);
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("c4cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 2);
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("c6cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 3);
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("c8cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 4);
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("cacdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 5);
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("cccdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 6);
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("cecdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+	TEST_EQUAL(int(classify_prefix(4, true, 8, to_hash("cfcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+	// last bucket, bucket 8: the first 8 bits (two hex digits) are skipped
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dc0cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 0);
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dc2cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 1);
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dc4cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 2);
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dc6cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 3);
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dc8cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 4);
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dcacdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 5);
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dcccdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 6);
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dcecdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 7);
+	TEST_EQUAL(int(classify_prefix(8, true, 8, to_hash("dcfcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdc"))), 7);
+	// last bucket, bucket 12: the first 12 bits (three hex digits) are skipped
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdc0cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 0);
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdc2cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 1);
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdc4cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 2);
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdc6cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 3);
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdc8cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 4);
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdcacdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 5);
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdcccdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 6);
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdcecdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+	TEST_EQUAL(int(classify_prefix(12, true, 8, to_hash("cdcfcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+
+	// not the last bucket in the routing table: one extra bit is skipped, so bucket 11 classifies like the last bucket 12 above
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdc0cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 0);
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdc2cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 1);
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdc4cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 2);
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdc6cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 3);
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdc8cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 4);
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdcacdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 5);
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdcccdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 6);
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdcecdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+	TEST_EQUAL(int(classify_prefix(11, false, 8, to_hash("cdcfcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+	// bucket 12, not the last: 13 bits are skipped, so the high bit of the fourth hex digit is ignored (8 and 0 would classify alike, as would f and 7)
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdc8cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 0);
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdc9cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 1);
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdcacdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 2);
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdcbcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 3);
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdcccdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 4);
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 5);
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdcecdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 6);
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdcfcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+	TEST_EQUAL(int(classify_prefix(12, false, 8, to_hash("cdc7cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+
+	// larger bucket: with 16 slots the prefix is 4 bits, i.e. exactly the fourth hex digit
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc0cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 0);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc1cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 1);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc2cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 2);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc3cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 3);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc4cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 4);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc5cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 5);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc6cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 6);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc7cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 7);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc8cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 8);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdc9cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 9);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdcacdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 10);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdcbcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 11);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdcccdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 12);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 13);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdcecdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 14);
+	TEST_EQUAL(int(classify_prefix(12, true, 16, to_hash("cdcfcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"))), 15);
+}
+
+namespace {
+node_entry n(ip_set* ips, char const* nid, bool verified = true, int rtt = 0, int failed = 0)
+{ // test helper: builds a node_entry with the given hex ID, verified flag, RTT and timeout count
+	node_entry e(rand_udp_ep());
+	if (ips) ips->insert(e.addr()); // a null ips leaves the address unregistered
+	e.verified = verified;
+	e.rtt = static_cast<std::uint16_t>(rtt);
+	e.id = to_hash(nid);
+	if (failed != 0) e.timeout_count = static_cast<std::uint8_t>(failed);
+	return e;
+}
+}
+
+#ifndef TORRENT_DISABLE_LOGGING
+#define LOGGER , nullptr
+#else
+#define LOGGER
+#endif
+TORRENT_TEST(replace_node_impl)
+{
+	// replace specific prefix "slot": every slot holds one node with RTT 50, and the new node (default RTT 0) shares its slot with the 9f... node
+	{
+	ip_set p;
+	dht::bucket_t b = {
+		n(&p, "1fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "3fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "5fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "7fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "9fffffffffffffffffffffffffffffffffffffff", true, 50), // <== replaced
+		n(&p, "bfffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "dfffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "ffffffffffffffffffffffffffffffffffffffff", true, 50),
+	};
+	TEST_EQUAL(p.size(), 8);
+	TEST_CHECK(
+		replace_node_impl(n(nullptr, "9fcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd")
+	, b, p, 0, 8, true LOGGER) == routing_table::node_added);
+	TEST_CHECK(b[4].id == to_hash("9fcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"));
+	TEST_EQUAL(p.size(), 8);
+	}
+
+	// only try to replace specific prefix "slot", and if we fail (RTT is
+	// higher), don't replace anything else, even though every other node has a far higher RTT
+	{
+	ip_set p;
+	dht::bucket_t b = {
+		n(&p, "1fffffffffffffffffffffffffffffffffffffff", true, 500),
+		n(&p, "3fffffffffffffffffffffffffffffffffffffff", true, 500),
+		n(&p, "5fffffffffffffffffffffffffffffffffffffff", true, 500),
+		n(&p, "7fffffffffffffffffffffffffffffffffffffff", true, 500),
+		n(&p, "9fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "bfffffffffffffffffffffffffffffffffffffff", true, 500),
+		n(&p, "dfffffffffffffffffffffffffffffffffffffff", true, 500),
+		n(&p, "ffffffffffffffffffffffffffffffffffffffff", true, 500),
+	};
+	TEST_EQUAL(p.size(), 8);
+	TEST_CHECK(
+		replace_node_impl(n(nullptr, "9fcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd", true, 100)
+	, b, p, 0, 8, true LOGGER) != routing_table::node_added);
+	TEST_CHECK(b[4].id == to_hash("9fffffffffffffffffffffffffffffffffffffff"));
+	TEST_EQUAL(p.size(), 8);
+	}
+
+	// if there are multiple candidates to replace, pick the one with the highest
+	// RTT. We're picking the prefix slots with duplicates: here the new node's slot is empty and only the bf... slot holds two nodes
+	{
+	ip_set p;
+	dht::bucket_t b = {
+		n(&p, "1fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "3fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "5fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "7fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "bfffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "bfffffffffffffffffffffffffffffffffffffff", true, 51), // <== replaced
+		n(&p, "dfffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "ffffffffffffffffffffffffffffffffffffffff", true, 50),
+	};
+	TEST_EQUAL(p.size(), 8);
+	TEST_CHECK(
+		replace_node_impl(n(nullptr, "9fcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd", true, 50)
+	, b, p, 0, 8, true LOGGER) == routing_table::node_added);
+	TEST_CHECK(b[5].id == to_hash("9fcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"));
+	TEST_EQUAL(p.size(), 8);
+	}
+
+	// if there is a node with fail count > 0, replace that, regardless of
+	// anything else (here the failed node is in a different prefix slot than the new node)
+	{
+	ip_set p;
+	dht::bucket_t b = {
+		n(&p, "1fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "3fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "5fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "7fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "9fffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "bfffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "dfffffffffffffffffffffffffffffffffffffff", true, 50),
+		n(&p, "ffffffffffffffffffffffffffffffffffffffff", true, 50, 1), // <== replaced
+	};
+	TEST_EQUAL(p.size(), 8);
+	TEST_CHECK(
+		replace_node_impl(n(nullptr, "9fcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd", true, 50)
+	, b, p, 0, 8, true LOGGER) == routing_table::node_added);
+	TEST_CHECK(b[7].id == to_hash("9fcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcdcd"));
+	TEST_EQUAL(p.size(), 8);
+	}
+}
+
 // TODO: test obfuscated_get_peers

 #else