From e4df6331577682a9cd5ae0a58681be7c4cf8153e Mon Sep 17 00:00:00 2001
From: Arvid Norberg <arvid@libtorrent.org>
Date: Sun, 20 Mar 2011 05:47:27 +0000
Subject: [PATCH] added another disk cache flush algorithm to minimize readback
 when hashing pieces

---
 docs/manual.rst                         |  7 ++-
 examples/client_test.cpp                |  3 +-
 include/libtorrent/disk_io_thread.hpp   | 10 +++-
 include/libtorrent/session_settings.hpp |  2 +-
 parse_session_stats.py                  |  5 +-
 src/disk_io_thread.cpp                  | 63 ++++++++++++++++++++++---
 src/session.cpp                         |  6 ++-
 src/session_impl.cpp                    | 37 +++++++++------
 8 files changed, 101 insertions(+), 32 deletions(-)
diff --git a/docs/manual.rst b/docs/manual.rst
index 2961db8ef..2b8f6c267 100644
--- a/docs/manual.rst
+++ b/docs/manual.rst
@@ -4294,7 +4294,7 @@ session_settings
 		int file_checks_delay_per_block;
 
 		enum disk_cache_algo_t
-		{ lru, largest_contiguous };
+		{ lru, largest_contiguous, avoid_readback };
 
 		disk_cache_algo_t disk_cache_algorithm;
 
@@ -4898,7 +4898,10 @@ flushes the entire piece, in the write cache, that was least recently
 written to. This is specified by the ``session_settings::lru`` enum
 value. ``session_settings::largest_contiguous`` will flush the largest
 sequences of contiguous blocks from the write cache, regarless of the
-piece's last use time.
+piece's last use time. ``session_settings::avoid_readback`` will prioritize
+flushing blocks in the order they are hashed, to avoid having to read them
+back in to verify the hash of the piece once it's done. This is most useful
+for high throughput setups, where reading from the disk is especially expensive.
 
 ``read_cache_line_size`` is the number of blocks to read into the read
 cache when a read cache miss occurs. Setting this to 0 is essentially
diff --git a/examples/client_test.cpp b/examples/client_test.cpp
index c20641108..3eec2fadf 100644
--- a/examples/client_test.cpp
+++ b/examples/client_test.cpp
@@ -1845,7 +1845,8 @@ int main(int argc, char* argv[])
 					out += esc("0");
 #endif
 					char const* piece_state[4] = {"", " slow", " medium", " fast"};
-					snprintf(str, sizeof(str), "]%s", piece_state[i->piece_state]);
+					snprintf(str, sizeof(str), "] %2d%s ", cp ? cp->next_to_hash : -1
+						, piece_state[i->piece_state]);
 					out += str;
 					if (cp)
 					{
diff --git a/include/libtorrent/disk_io_thread.hpp b/include/libtorrent/disk_io_thread.hpp
index 0c5e16862..7bb5413db 100644
--- a/include/libtorrent/disk_io_thread.hpp
+++ b/include/libtorrent/disk_io_thread.hpp
@@ -73,6 +73,7 @@ namespace libtorrent
 		int piece;
 		std::vector<bool> blocks;
 		ptime last_use;
+		int next_to_hash;
 		enum kind_t { read_cache = 0, write_cache = 1 };
 		kind_t kind;
 	};
@@ -353,6 +354,8 @@ namespace libtorrent
 			int piece;
 			// storage this piece belongs to
 			boost::intrusive_ptr<piece_manager> storage;
+			// the pointers to the block data
+			boost::shared_array<cached_block_entry> blocks;
 			// the last time a block was writting to this piece
 			// plus the minimum amount of time the block is guaranteed
 			// to stay in the cache
@@ -361,8 +364,11 @@ namespace libtorrent
 			int num_blocks;
 			// used to determine if this piece should be flushed
 			int num_contiguous_blocks;
-			// the pointers to the block data
-			boost::shared_array<cached_block_entry> blocks;
+			// this is the first block that has not yet been hashed
+			// by the partial hasher. When minimizing read-back, this
+			// is used to determine if flushing a range would force us
+			// to read it back later when hashing
+			int next_block_to_hash;
 			
 			std::pair<void*, int> storage_piece_pair() const
 			{ return std::pair<void*, int>(storage.get(), piece); }
diff --git a/include/libtorrent/session_settings.hpp b/include/libtorrent/session_settings.hpp
index cdc87102e..025a207e7 100644
--- a/include/libtorrent/session_settings.hpp
+++ b/include/libtorrent/session_settings.hpp
@@ -751,7 +751,7 @@ namespace libtorrent
 		int file_checks_delay_per_block;
 
 		enum disk_cache_algo_t
-		{ lru, largest_contiguous };
+		{ lru, largest_contiguous, avoid_readback };
 
 		disk_cache_algo_t disk_cache_algorithm;
 
diff --git a/parse_session_stats.py b/parse_session_stats.py
index 34f66df95..8ef2eaa63 100755
--- a/parse_session_stats.py
+++ b/parse_session_stats.py
@@ -62,9 +62,10 @@ gen_report('piece_picker_end_game', ['end game piece picker blocks', 'piece pick
 gen_report('piece_picker', ['piece picks', 'reject piece picks', 'unchoke piece picks', 'incoming redundant piece picks', 'incoming piece picks', 'end game piece picks', 'snubbed piece picks'])
 gen_report('bandwidth', ['% failed payload bytes', '% wasted payload bytes', '% protocol bytes'])
 gen_report('disk_time', ['disk read time', 'disk write time', 'disk queue time', 'disk hash time', 'disk job time', 'disk sort time'])
-gen_report('disk_time2', ['cumulative read time', 'cumulative write time', 'cumulative hash time', 'cumulative job time', 'cumulative sort time'])
-gen_report('disk_cache_hits', ['disk block read', 'read cache hits', 'disk block written'])
+gen_report('disk_time_proportion', ['% read time', '% write time', '% hash time', '% sort time'])
+gen_report('disk_cache_hits', ['disk block read', 'read cache hits', 'disk block written', 'disk read back'])
 gen_report('disk_cache', ['read disk cache size', 'disk cache size', 'disk buffer allocations', 'cache size'])
+gen_report('disk_readback', ['% read back'])
 gen_report('disk_queue', ['disk queue size', 'disk queued bytes'])
 gen_report('waste', ['failed bytes', 'redundant bytes', 'download rate'])
 gen_report('connect_candidates', ['connect candidates'])
diff --git a/src/disk_io_thread.cpp b/src/disk_io_thread.cpp
index 4e341af90..1780c9f04 100644
--- a/src/disk_io_thread.cpp
+++ b/src/disk_io_thread.cpp
@@ -305,6 +305,7 @@ namespace libtorrent
 			torrent_info const& ti = *i->storage->info();
 			if (ti.info_hash() != ih) continue;
 			cached_piece_info info;
+			info.next_to_hash = i->next_block_to_hash;
 			info.piece = i->piece;
 			info.last_use = i->expire;
 			info.kind = cached_piece_info::write_cache;
@@ -320,6 +321,7 @@ namespace libtorrent
 			torrent_info const& ti = *i->storage->info();
 			if (ti.info_hash() != ih) continue;
 			cached_piece_info info;
+			info.next_to_hash = i->next_block_to_hash;
 			info.piece = i->piece;
 			info.last_use = i->expire;
 			info.kind = cached_piece_info::read_cache;
@@ -419,7 +421,10 @@ namespace libtorrent
 		{
 			TORRENT_ASSERT(i->storage);
 			flush_range(const_cast<cached_piece_entry&>(*i), 0, INT_MAX, l);
-			widx.erase(i++);
+			// we want to keep the piece in here to have an accurate
+			// number for next_block_to_hash, if we're in avoid_readback mode
+			if (m_settings.disk_cache_algorithm != session_settings::avoid_readback)
+				widx.erase(i++);
 		}
 
 		if (m_settings.explicit_read_cache) return;
@@ -638,9 +643,9 @@ namespace libtorrent
 			while (blocks > 0)
 			{
 				cache_lru_index_t::iterator i =
-				std::max_element(idx.begin(), idx.end()
-					, boost::bind(&disk_io_thread::cached_piece_entry::num_contiguous_blocks, _1)
-					< boost::bind(&disk_io_thread::cached_piece_entry::num_contiguous_blocks, _2));
+					std::max_element(idx.begin(), idx.end()
+						, boost::bind(&disk_io_thread::cached_piece_entry::num_contiguous_blocks, _1)
+						< boost::bind(&disk_io_thread::cached_piece_entry::num_contiguous_blocks, _2));
 				if (i == idx.end()) return ret;
 				tmp = flush_contiguous_blocks(const_cast<cached_piece_entry&>(*i), l);
 				if (i->num_blocks == 0) idx.erase(i);
@@ -648,6 +653,39 @@ namespace libtorrent
 				ret += tmp;
 			}
 		}
+		else if (m_settings.disk_cache_algorithm == session_settings::avoid_readback)
+		{
+			cache_lru_index_t& idx = m_pieces.get<1>();
+			for (cache_lru_index_t::iterator i = idx.begin(); i != idx.end(); ++i)
+			{
+				cached_piece_entry& p = const_cast<cached_piece_entry&>(*i);
+				if (!i->blocks[i->next_block_to_hash].buf) continue;
+				int piece_size = i->storage->info()->piece_size(i->piece);
+				int blocks_in_piece = (piece_size + m_block_size - 1) / m_block_size;
+				int start = i->next_block_to_hash;
+				int end = start + 1;
+				while (end < blocks_in_piece && i->blocks[end].buf) ++end;
+				tmp = flush_range(p, start, end, l);
+				p.num_contiguous_blocks = contiguous_blocks(p);
+				blocks -= tmp;
+				ret += tmp;
+				if (blocks <= 0) break;
+			}
+
+			// if we still need to flush blocks, flush the largest contiguous blocks
+			// regardless of whether we'll have to read them back later
+			while (blocks > 0)
+			{
+				cache_lru_index_t::iterator i =
+					std::max_element(idx.begin(), idx.end()
+						, boost::bind(&disk_io_thread::cached_piece_entry::num_contiguous_blocks, _1)
+						< boost::bind(&disk_io_thread::cached_piece_entry::num_contiguous_blocks, _2));
+				if (i == idx.end()) return ret;
+				tmp = flush_contiguous_blocks(const_cast<cached_piece_entry&>(*i), l);
+				blocks -= tmp;
+				ret += tmp;
+			}
+		}
 		return ret;
 	}
 
@@ -729,6 +767,7 @@ namespace libtorrent
 			--p.num_blocks;
 			++m_cache_stats.blocks_written;
 			--m_cache_stats.cache_size;
+			if (i == p.next_block_to_hash) ++p.next_block_to_hash;
 		}
 
 		ptime done = time_now_hires();
@@ -798,6 +837,7 @@ namespace libtorrent
 		p.expire = time_now() + seconds(j.cache_min_time);
 		p.num_blocks = 1;
 		p.num_contiguous_blocks = 1;
+		p.next_block_to_hash = 0;
 		p.blocks.reset(new (std::nothrow) cached_block_entry[blocks_in_piece]);
 		if (!p.blocks) return -1;
 		int block = j.offset / m_block_size;
@@ -983,6 +1023,7 @@ namespace libtorrent
 		p.expire = time_now() + seconds(j.cache_min_time);
 		p.num_blocks = 0;
 		p.num_contiguous_blocks = 0;
+		p.next_block_to_hash = 0;
 		p.blocks.reset(new (std::nothrow) cached_block_entry[blocks_in_piece]);
 		if (!p.blocks) return -1;
 
@@ -1106,6 +1147,7 @@ namespace libtorrent
 			pe.expire = time_now() + seconds(j.cache_min_time);
 			pe.num_blocks = 0;
 			pe.num_contiguous_blocks = 0;
+			pe.next_block_to_hash = 0;
 			pe.blocks.reset(new (std::nothrow) cached_block_entry[blocks_in_piece]);
 			if (!pe.blocks) return -1;
 			ret = read_into_piece(pe, 0, options, INT_MAX, l);
@@ -2070,7 +2112,9 @@ namespace libtorrent
 							--m_cache_stats.cache_size;
 							--const_cast<cached_piece_entry&>(*p).num_blocks;
 						}
-						else if ((block > 0 && p->blocks[block-1].buf) || (block < blocks_in_piece-1 && p->blocks[block+1].buf))
+						else if ((block > 0 && p->blocks[block-1].buf)
+							|| (block < blocks_in_piece-1 && p->blocks[block+1].buf)
+							|| p->num_blocks == 0)
 						{
 							// update the contiguous blocks counter for this piece. Only if it has
 							// an adjacent block. If it doesn't, we already know it couldn't have
@@ -2091,8 +2135,13 @@ namespace libtorrent
 						idx.modify(p, update_last_use(j.cache_min_time));
 						// we might just have created a contiguous range
 						// that meets the requirement to be flushed. try it
-						flush_contiguous_blocks(const_cast<cached_piece_entry&>(*p)
-							, l, m_settings.write_cache_line_size);
+						// if we're in avoid_readback mode, don't do this. Only flush
+						// pieces when we need more space in the cache (which will avoid
+						// flushing blocks out-of-order) or when we issue a hash job,
+						// which indicates the piece is completely downloaded
+						if (m_settings.disk_cache_algorithm != session_settings::avoid_readback)
+							flush_contiguous_blocks(const_cast<cached_piece_entry&>(*p)
+								, l, m_settings.write_cache_line_size);
 						if (p->num_blocks == 0) idx.erase(p);
 						test_error(j);
 						TORRENT_ASSERT(!j.storage->error());
diff --git a/src/session.cpp b/src/session.cpp
index 7e6d697e2..34fcc2ea1 100644
--- a/src/session.cpp
+++ b/src/session.cpp
@@ -228,8 +228,10 @@ namespace libtorrent
 		// the max number of bytes pending write before we throttle
 		// download rate
 		set.max_queued_disk_bytes = 100 * 1024 * 1024;
-		// flush write cache based on largest contiguous block
-		set.disk_cache_algorithm = session_settings::largest_contiguous;
+		// flush write cache in a way to minimize the amount we need to
+		// read back once we want to hash-check the piece. i.e. try to
+		// flush all blocks in-order
+		set.disk_cache_algorithm = session_settings::avoid_readback;
 
 		set.explicit_read_cache = false;
 		// prevent fast pieces to interfere with suggested pieces
diff --git a/src/session_impl.cpp b/src/session_impl.cpp
index 0ed1eb5f5..026a4a502 100644
--- a/src/session_impl.cpp
+++ b/src/session_impl.cpp
@@ -959,12 +959,12 @@ namespace aux {
 			":connect candidates"
 			":disk queue limit"
 			":disk queue low watermark"
-			":cumulative job time"
-			":cumulative read time"
-			":cumulative write time"
-			":cumulative hash time"
-			":cumulative sort time"
-			":disk total read back"
+			":% read time"
+			":% write time"
+			":% hash time"
+			":% sort time"
+			":disk read back"
+			":% read back"
 			"\n\n", m_stats_logger);
 	}
 #endif
@@ -2727,12 +2727,19 @@ namespace aux {
 			++peer_ul_rate_buckets[ul_bucket];
 		}
 
+		int low_watermark = m_settings.max_queued_disk_bytes_low_watermark == 0
+			? m_settings.max_queued_disk_bytes / 2
+			: m_settings.max_queued_disk_bytes_low_watermark;
+
 		if (now - m_last_log_rotation > hours(1))
 			rotate_stats_log();
 		
 		if (m_stats_logger)
 		{
 			cache_status cs = m_disk_thread.status();
+
+			int total_job_time = cs.cumulative_job_time == 0 ? 1 : cs.cumulative_job_time;
+
 			fprintf(m_stats_logger
 				, "%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
 				  "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
@@ -2742,8 +2749,8 @@ namespace aux {
 				  "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
 				  "%f\t%f\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
 				  "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
-				  "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t"
-				  "%d\t%d\t%d\t%d\n"
+				  "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%f\t%f\t"
+				  "%f\t%f\t%d\t%f\n"
 				, total_milliseconds(now - m_last_log_rotation) / 1000.f
 				, int(m_stat.total_upload() - m_last_uploaded)
 				, int(m_stat.total_download() - m_last_downloaded)
@@ -2831,13 +2838,13 @@ namespace aux {
 				, m_settings.connections_limit
 				, connect_candidates
 				, int(m_settings.max_queued_disk_bytes)
-				, int(m_settings.max_queued_disk_bytes_low_watermark)
-				, int(cs.cumulative_job_time)
-				, int(cs.cumulative_read_time)
-				, int(cs.cumulative_write_time)
-				, int(cs.cumulative_hash_time)
-				, int(cs.cumulative_sort_time)
-				, cs.total_read_back
+				, low_watermark
+				, float(cs.cumulative_read_time * 100.f / total_job_time)
+				, float(cs.cumulative_write_time * 100.f / total_job_time)
+				, float(cs.cumulative_hash_time * 100.f / total_job_time)
+				, float(cs.cumulative_sort_time * 100.f / total_job_time)
+				, int(cs.total_read_back - m_last_cache_status.total_read_back)
+				, float(cs.total_read_back * 100.f / (cs.blocks_written == 0 ? 1: cs.blocks_written))
 			);
 			m_last_cache_status = cs;
 			m_last_failed = m_total_failed_bytes;