Checkpoint.

Add disk caching code and first draft of fractional index.
2025-01-24 11:41:58 +00:00 · 2004-03-12 18:28:14 +00:00 · 2004-03-12 18:28:14 +00:00 · 9ffbb5adca
commit 9ffbb5adca
parent 7c5190d2c8
15 changed files with 453 additions and 366 deletions
--- a/src/cmd/venti/buildbuck.c
+++ b/src/cmd/venti/buildbuck.c
@ -80,7 +80,7 @@ buildbucket(Index *ix, IEStream *ies, IBucket *ib)

 	buck = TWID32;
 	ib->n = 0;
-	ib->next = 0;
+	ib->depth = 0;
 	while(ies->n){
 		b = peekientry(ies);
 		if(b == nil)
--- a/src/cmd/venti/buildindex.c
+++ b/src/cmd/venti/buildindex.c
@ -7,15 +7,9 @@ writebucket(Index *ix, u32int buck, IBucket *ib, ZBlock *b)
 {
 	ISect *is;

-	is = findisect(ix, buck);
-	if(is == nil){
-		seterr(EAdmin, "bad math in writebucket");
+	is = findibucket(ix, buck, &buck);
+	if(is == nil)
 		return -1;
-	}
-	if(buck < is->start || buck >= is->stop)
-		seterr(EAdmin, "index write out of bounds: %d not in [%d,%d)\n",
-				buck, is->start, is->stop);
-	buck -= is->start;
 	qlock(&stats.lock);
 	stats.indexwrites++;
 	qunlock(&stats.lock);
@ -47,7 +41,7 @@ buildindex(Index *ix, Part *part, u64int off, u64int clumps, int zero)
 	ib.data = b->data + IBucketSize;
 	zib.data = z->data + IBucketSize;
 	zib.n = 0;
-	zib.next = 0;
+	zib.depth = 0;
 	for(;;){
 		buck = buildbucket(ix, ies, &ib);
 		found += ib.n;
--- a/src/cmd/venti/checkindex.c
+++ b/src/cmd/venti/checkindex.c
@ -11,12 +11,7 @@ checkbucket(Index *ix, u32int buck, IBucket *ib)
 	IEntry ie, eie;
 	int i, ei, ok, c;

-	is = findisect(ix, buck);
-	if(is == nil){
-		seterr(EAdmin, "bad math in checkbuckets");
-		return -1;
-	}
-	buck -= is->start;
+	is = findibucket(ix, buck, &buck);
 	eb = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
 	if(eb == nil)
 		return -1;
@ -87,7 +82,7 @@ u64int found = 0;
 	ib.data = b->data;
 	zib.data = z->data;
 	zib.n = 0;
-	zib.next = 0;
+	zib.depth = 0;
 	for(;;){
 		buck = buildbucket(ix, ies, &ib);
 		found += ib.n;
--- a/src/cmd/venti/clump.c
+++ b/src/cmd/venti/clump.c
@ -60,7 +60,7 @@ if(0)print("whackedblock %08x %p\n", mainindex->arenas[0], &cl);
 	a = writeiclump(ix, &cl, cb->data);

 	freezblock(cb);
-	if(a == 0)
+	if(a == TWID64)
 		return -1;

 	qlock(&stats.lock);
--- a/src/cmd/venti/conv.c
+++ b/src/cmd/venti/conv.c
@ -488,7 +488,7 @@ void
 unpackibucket(IBucket *b, u8int *buf)
 {
 	b->n = U16GET(buf);
-	b->next = U32GET(&buf[U16Size]);
+	b->depth = U32GET(&buf[U16Size]);
 	b->data = buf + IBucketSize;
 }

@ -496,5 +496,5 @@ void
 packibucket(IBucket *b, u8int *buf)
 {
 	U16PUT(buf, b->n);
-	U32PUT(&buf[U16Size], b->next);
+	U32PUT(&buf[U16Size], b->depth);
 }
--- a/src/cmd/venti/dat.h
+++ b/src/cmd/venti/dat.h
@ -79,7 +79,8 @@ enum

 	ArenaPartVersion	= 3,
 	ArenaVersion		= 4,
-	IndexVersion		= 1,
+	IndexVersion1		= 1,
+	IndexVersion2		= 2,
 	ISectVersion		= 1,

 	/*
@ -116,11 +117,14 @@ enum
 	MaxClumpBlocks		=  (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,

 	/*
-	 * dirty flags 
+	 * dirty flags - order controls disk write order
 	 */
 	DirtyArena		= 1,
+	DirtyIndexSplit,
 	DirtyIndex,
+	DirtyIndexBitmap,
 	DirtyArenaCib,
+	DirtyMax,

 	VentiZZZZZZZZ
 };
@ -367,6 +371,11 @@ struct Index
 	u32int		buckets;		/* last bucket used in disk hash table */
 	u32int		blocksize;
 	u32int		tabsize;		/* max. bytes in index config */
+	u32int		bitblocks;
+	u32int		maxdepth;
+	u32int		bitkeylog;
+	u32int		bitkeymask;
+
 	int		mapalloc;		/* first arena to check when adding a lump */
 	Arena		**arenas;		/* arenas in the mapping */
 	ISect		**sects;		/* sections which hold the buckets */
@ -440,7 +449,7 @@ struct IEntry
 struct IBucket
 {
 	u16int		n;			/* number of active indices */
-	u32int		next;			/* overflow bucket */
+	u32int		depth;		/* depth in version 2 (was overflow in v1) */
 	u8int		*data;
 };

--- a/src/cmd/venti/dcache.c
+++ b/src/cmd/venti/dcache.c
@ -230,7 +230,6 @@ dirtydblock(DBlock *b, int dirty)
 	int odirty;
 	Part *p;

-fprint(2, "dirty %p\n", b);
 	rlock(&dcache.dirtylock);
 	assert(b->ref != 0);
 	assert(b->dirtying == 0);
@ -242,8 +241,16 @@ fprint(2, "dirty %p\n", b);
 	stats.dirtydblocks++;
 	qunlock(&stats.lock);

+	/*
+	 * In general, shouldn't mark any block as more than one
+	 * type, except that split index blocks are a subcase of index
+	 * blocks.  Only clean blocks ever get marked DirtyIndexSplit,
+	 * though, so we don't need the opposite conjunction here.
+	 */
 	if(b->dirty)
-		assert(b->dirty == dirty);
+		assert(b->dirty == dirty
+			|| (b->dirty==DirtyIndexSplit && dirty==DirtyIndex));
+
 	odirty = b->dirty;
 	b->dirty = dirty;
 	p = b->part;
@ -533,7 +540,7 @@ flushtimerproc(void *v)
 static void
 flushproc(void *v)
 {
-	int i, n;
+	int i, j, n;
 	DBlock *b, **write;

 	USED(v);
@ -575,25 +582,10 @@ flushproc(void *v)

 		qsort(write, n, sizeof(write[0]), writeblockcmp);

-		/*
-		 * At the beginning of the array are the arena blocks.
-		 */
-		fprint(2, "flushproc: write arena blocks\n");
+		/* Write each stage of blocks out. */
 		i = 0;
-		i += parallelwrites(write+i, write+n, DirtyArena);
-
-		/*
-		 * Next are the index blocks.
-		 */
-		fprint(2, "flushproc: write index blocks\n");
-		i += parallelwrites(write+i, write+n, DirtyIndex);
-
-		/*
-		 * Finally, the arena clump info blocks.
-		 */
-		fprint(2, "flushproc: write cib blocks\n");
-		i += parallelwrites(write+i, write+n, DirtyArenaCib);
-
+		for(j=1; j<DirtyMax; j++)
+			i += parallelwrites(write+i, write+n, j);
 		assert(i == n);

 		fprint(2, "flushproc: update dirty bits\n");
--- a/src/cmd/venti/fns.h
+++ b/src/cmd/venti/fns.h
@ -20,7 +20,6 @@ void		*erealloc(void *, ulong);
 char		*estrdup(char*);
 void		*ezmalloc(ulong);
 Arena		*findarena(char *name);
-ISect		*findisect(Index *ix, u32int buck);
 int		flushciblocks(Arena *arena);
 void		flushdcache(void);
 void		flushqueue(void);
@ -57,6 +56,7 @@ int		initventi(char *config);
 void		insertlump(Lump *lump, Packet *p);
 int		insertscore(u8int *score, IAddr *ia, int write);
 ZBlock		*loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify);
+DBlock	*loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib);
 int		loadientry(Index *index, u8int *score, int type, IEntry *ie);
 void		logerr(int severity, char *fmt, ...);
 Lump		*lookuplump(u8int *score, int type);
--- a/src/cmd/venti/httpd.c
+++ b/src/cmd/venti/httpd.c
@ -137,14 +137,12 @@ httpproc(void *v)
 	c = v;

 	for(t = 15*60*1000; ; t = 15*1000){
-fprint(2, "httpd: get headers\n");
 		if(hparsereq(c, t) < 0)
 			break;

 		ok = -1;
 		for(i = 0; i < MaxObjs && objs[i].name[0]; i++){
 			if(strcmp(c->req.uri, objs[i].name) == 0){
-fprint(2, "httpd: call function %p\n", objs[i].f);
 				ok = (*objs[i].f)(c);
 				break;
 			}
@ -158,9 +156,7 @@ fprint(2, "httpd: call function %p\n", objs[i].f);
 		if(ok < 0)
 			break;
 	}
-print("httpd cleanup %d\n", c->hin.fd);
 	hreqcleanup(c);
-print("close %d\n", c->hin.fd);
 	close(c->hin.fd);
 	free(c);
 }
@ -239,12 +235,9 @@ estats(HConnect *c)

 	r = preqtext(c);
 	if(r < 0)
-{
-fprint(2, "preqtext failed\n");
 		return r;
-}

-fprint(2, "write stats\n");
+
 	hout = &c->hout;
 	hprint(hout, "lump writes=%,ld\n", stats.lumpwrites);
 	hprint(hout, "lump reads=%,ld\n", stats.lumpreads);
@ -277,21 +270,21 @@ fprint(2, "write stats\n");
 	hprint(hout, "disk cache misses=%,ld\n", stats.pcmiss);
 	hprint(hout, "disk cache reads=%,ld\n", stats.pcreads);
 	hprint(hout, "disk cache bytes read=%,lld\n", stats.pcbreads);
-fprint(2, "write new stats\n");
+
 	hprint(hout, "disk cache writes=%,ld\n", stats.dirtydblocks);
 	hprint(hout, "disk cache writes absorbed=%,ld %d%%\n", stats.absorbedwrites,
 		percent(stats.absorbedwrites, stats.dirtydblocks));

-fprint(2, "back to old stats\n");
+	hprint(hout, "disk cache flushes=%,ld\n", stats.dcacheflushes);
+	hprint(hout, "disk cache flush writes=%,ld (%,ld per flush)\n", 
+		stats.dcacheflushwrites, stats.dcacheflushwrites/stats.dcacheflushes);

 	hprint(hout, "disk writes=%,ld\n", stats.diskwrites);
 	hprint(hout, "disk bytes written=%,lld\n", stats.diskbwrites);
 	hprint(hout, "disk reads=%,ld\n", stats.diskreads);
 	hprint(hout, "disk bytes read=%,lld\n", stats.diskbreads);

-fprint(2, "hflush stats\n");
 	hflush(hout);
-fprint(2, "done with stats\n");
 	return 0;
 }

--- a/src/cmd/venti/icache.c
+++ b/src/cmd/venti/icache.c
@ -58,7 +58,6 @@ lookupscore(u8int *score, int type, IAddr *ia, int *rac)
 	IEntry d, *ie, *last;
 	u32int h;

-fprint(2, "lookupscore %V %d\n", score, type);
 	qlock(&stats.lock);
 	stats.iclookups++;
 	qunlock(&stats.lock);
--- a/src/cmd/venti/index.c
+++ b/src/cmd/venti/index.c
@ -1,101 +1,110 @@
 /*
- * Index, mapping scores to log positions.  The log data mentioned in
- * the index _always_ goes out to disk before the index blocks themselves.
- * A counter in the arena tail records which logged blocks have been
- * successfully indexed.  The ordering of dirtydcache calls along with
- * the flags passed to dirtydcache ensure the proper write ordering.
- * 
- * For historical reasons, there are two indexing schemes. In both,
- * the index is broken up into some number of fixed-size (say, 8kB)
- * buckets holding index entries.  An index entry is about 40 bytes.
- * The index can be spread across many disks, although in small
- * configurations it is not uncommon for the index and arenas to be
- * on the same disk.
- *  *
- * In the first scheme, the many buckets are treated as a giant on-disk
- * hash table.  If there are N buckets, then the top 32 bits of the
- * score are used as an index into the hash table, with each bucket
- * holding 2^32 / N of the index space.  The index must be sized so
- * that a bucket can't ever overflow.  Assuming that a typical compressed
- * data block is about 4000 bytes, the index size is expected to be
- * about 1% of the total data size.  Since scores are essentially
- * random, they will be distributed evenly among the buckets, so all
- * buckets should be about the same fullness.  A factor of 5 gives us
- * a wide comfort boundary, so the index storage is suggested to be
- * 5% of the total data storage.
- * 
- * Unfortunately, this very sparse index does not make good use of the
- * disk -- most of it is empty, and disk reads, which are costly because
- * of the random seek to get to an arbitrary bucket, tend to bring in
- * only a few entries, making them hardly cost effective.  The second
- * scheme is a variation on the first scheme that tries to lay out the
- * index in a denser format on the disk.  In this scheme, the index
- * buckets are organized in a binary tree, with all data at the leaf
- * nodes.  Bucket numbers are easiest to treat in binary.  In the
- * beginning, there is a single bucket with 0-bit number "".  When a
- * bucket with number x fills, it splits into buckets 0x and 1x.  Since
- * x and 0x are the same number, this means that half the bucket space
- * is assigned to a new bucket, 1x.  So "" splits into 0 and 1, 1
- * splits into 01 and 11, and so on.  The bucket number determines the
- * placement on disk, and the bucket header includes the number of
- * bits represented by the bucket.  To find the right bucket for a
- * given score with top 32-bits x, read bucket "" off disk and check
- * its depth.  If it is zero, we're done.  If x doesn't match the
- * number of bits in 0's header, we know that the block has split, so
- * we use the last 1 bit of x to load a new block (perhaps the same
- * one) and repeat, using successively more bits of x until we find
- * the block responsible for x.  Note that we're using bits from the
- * _right_ not the left.  This gives the "split into 0x and 1x" property
- * needed by the tree and is easier than using the reversal of the
- * bits on the left.
- *  *
- * At the moment, this second scheme sounds worse than the first --
- * there are log n disk reads to find a block instead of just 1.  But
- * we can keep the tree structure in memory, using 1 bit per block to
- * keep track of whether that block has been allocated.  Want to know
- * whether block x has been split?  Check whether 1x is allocated.  1
- * bit per 8kB gives us an in-use bitmap 1/65536 the size of the index.
- * The index data is 1/100 the size of the arena data, explained above.
- * In this scheme, after the first block split, the index is always
- * at least half full (proof by induction), so it is at most 2x the
- * size of the index data.  This gives a bitmap size of 2/6,553,600
- * of the data size.  Let's call that one millionth.  So each terabyte
- * of storage requires one megabyte of free bitmap.  The bitmap is
- * going to be accessed so much that it will be effectively pinned in
- * the cache.  So it still only takes one disk read to find the block
- * -- the tree walking can be done by consulting the in-core bitmap
- * describing the tree structure.
- *  *
- * Now we have to worry about write ordering, though.  What if the
- * bitmap ends up out of sync with the index blocks?  When block x
- * splits into 0x and 1x, causing an update to bitmap block b, the
- * dcache flushing code makes sure that the writes happen in this
- * order: first 1x, then 0x, then the bitmap.  Writing 1x before 0x
- * makes sure we write the split-off entries to disk before we discard
- * them from 0x.  Writing the bitmap after both simplifies the following
- * case analysis.
- * 
- * If Venti is interrupted while flushing blocks to disk, the state
- * of the disk upon next startup can be one of the following:
- *  *
-
- * (a) none of 0x, 1x, and b are written
- *	Looks like nothing happened - use as is.
+ * Index, mapping scores to log positions. 
 *
- * (b) 1x is written
- *	Since 0x hasn't been rewritten and the bitmap doesn't record 1x
- * 	as being in use, it's like this never happened.  See (a).
- *	This does mean that the bitmap trumps actual disk contents:
- *	no need to zero the index disks anymore.
+ * The index is made up of some number of index sections, each of
+ * which is typically stored on a different disk.  The blocks in all the 
+ * index sections are logically numbered, with each index section 
+ * responsible for a range of blocks.  Blocks are typically 8kB.
 *
- * (c) 0x and 1x are written, but not the bitmap
- *	Writing 0x commits the change.  When we next encounter
- *	0x or 1x on a lookup (we can't get to 1x except through x==0x),
- *	the bitmap will direct us to x, we'll load the block and find out
- * 	that its now 0x, so we update the bitmap.
+ * Index Version 1:
+ * 
+ * The N index blocks are treated as a giant hash table.  The top 32 bits
+ * of score are used as the key for a lookup.  Each index block holds
+ * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
+ * 
+ * The index is sized so that a particular bucket is extraordinarily 
+ * unlikely to overflow: assuming compressed data blocks are 4kB 
+ * on disk, and assuming each block has a 40 byte index entry,
+ * the index data will be 1% of the total data.  Since scores are essentially
+ * random, all buckets should be about the same fullness.
+ * A factor of 5 gives us a wide comfort boundary to account for 
+ * random variation.  So the index disk space should be 5% of the arena disk space.
 *
- * (d) 0x, 1x, and b are written.
- *	Great - just use as is.
+ * Problems with Index Version 1:
+ * 
+ * Because the index size is chosen to handle the worst case load,
+ * the index is very sparse, especially when the Venti server is mostly empty.
+ * This has a few bad properties.
+ * 
+ * Loading an index block (which typically requires a random disk seek)
+ * is a very expensive operation, yet it yields only a couple index entries.
+ * We're not making efficient use of the disk arm.
+ *
+ * Writing a block requires first checking to see if the block already
+ * exists on the server, which in turn requires an index lookup.  When
+ * writing fresh data, these lookups will fail.  The index entry cache 
+ * cannot serve these, so they must go to disk, which is expensive.
+ * 
+ * Thus both the reading and the writing of blocks are held back by
+ * the expense of accessing the index.
+ * 
+ * Index Version 2:
+ * 
+ * The index is sized to be exactly 2^M blocks.  The index blocks are 
+ * logically arranged in a (not exactly balanced) binary tree of depth at
+ * most M.  The nodes are named by bit strings describing the path from
+ * the root to the node.  The root is . (dot).  The left child of the root is .0,
+ * the right child .1.  The node you get to by starting at the root and going
+ * left right right left is .0110.  At the beginning, there is only the root block.
+ * When a block with name .xxx fills, it splits into .xxx0 and .xxx1.
+ * All the index data is kept in the leaves of the tree.
+ *
+ * Index leaf blocks are laid out on disk by interpreting the bitstring as a 
+ * binary fraction and multiplying by 2^M -- .0 is the first block, .1 is
+ * the block halfway into the index, .0110 is at position 6/16, and
+ * .xxx and .xxx0 map to the same block (but only one can be a leaf
+ * node at any given time, so this is okay!).  A cheap implementation of
+ * this is to append zeros to the bit string to make it M bits long.  That's
+ * the integer index block number.
+ *
+ * To find the leaf block that should hold a score, use the bits of the 
+ * score one at a time to walk down the tree to a leaf node.  If the tree
+ * has leaf nodes .0, .10, and .11, then score 0110101... ends up in bucket
+ * .0 while score 101110101... ends up in bucket .10.  There is no leaf node
+ * .1 because it has split into .10 and .11.
+ *
+ * If we know which disk blocks are in use, we can reconstruct the interior
+ * of the tree: if .xxx1 is in use, then .xxx has been split.  We keep an in-use
+ * bitmap of all index disk blocks to aid in reconstructing the interior of the
+ * tree.  At one bit per index block, the bitmap is small enough to be kept
+ * in memory even on the largest of Venti servers.
+ *
+ * After the root block splits, the index blocks being used will always be
+ * at least half full (averaged across the entire index).  So unlike Version 1,
+ * Index Version 2 is quite dense, alleviating the two problems above.
+ * Index block reads now return many index entries.  More importantly,
+ * small servers can keep most of the index in the disk cache, making them
+ * more likely to handle negative lookups without going to disk.
+ *
+ * As the index becomes more full, Index Version 2's performance
+ * degrades gracefully into Index Version 1.  V2 is really an optimization
+ * for little servers.
+ *
+ * Write Ordering for Index Version 2:
+ * 
+ * Unlike Index Version 1, Version 2 must worry about write ordering
+ * within the index.  What happens if the in-use bitmap is out of sync
+ * with the actual leaf nodes?  What happens if .xxx splits into .xxx0 and
+ * .xxx1 but only one of the new blocks gets written to disk?
+ *
+ * We arrange that when .xxx splits, the .xxx1 block is written first,
+ * then the .xxx0 block, and finally the in-use bitmap entry for .xxx1.
+ * The split is committed by the writing of .xxx0.  This ordering leaves
+ * two possible partial disk writes:
+ *
+ * (a) If .xxx1 is written but .xxx0 and the bitmap are not, then it's as if
+ * the split never happened -- we won't think .xxx1 is in use, and we
+ * won't go looking for it.
+ *
+ * (b) If .xxx1 and .xxx0 are written but the bitmap is not, then the first
+ * time we try to load .xxx, we'll get .xxx0 instead, realize the bitmap is
+ * out of date, and update the bitmap.
+ *
+ * Backwards Compatibility
+ *
+ * Because there are large Venti servers in use with Index V1, this code
+ * will read either index version, but when asked to create a new index,
+ * will only create V2.
 */

 #include "stdinc.h"
@ -105,6 +114,7 @@
 static int	bucklook(u8int *score, int type, u8int *data, int n);
 static int	writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b);
 static int	okibucket(IBucket *ib, ISect *is);
+static int	initindex1(Index*);
 static ISect	*initisect1(ISect *is);

 //static QLock	indexlock;	//ZZZ
@ -118,7 +128,7 @@ initindex(char *name, ISect **sects, int n)
 	Index *ix;
 	ISect *is;
 	u32int last, blocksize, tabsize;
-	int i, nbits;
+	int i;

 	if(n <= 0){
 		seterr(EOk, "no index sections to initialize index");
@ -171,21 +181,7 @@ initindex(char *name, ISect **sects, int n)
 	ix->tabsize = tabsize;
 	ix->buckets = last;

-	/* compute number of buckets used for in-use map */
-	nbits = blocksize*8;
-	ix->bitbuckets = (ix->buckets+nbits-1)/nbits;
-
-	last -= ix->bitbuckets;
-	/* 
-	 * compute log of max. power of two not greater than 
-	 * number of remaining buckets.
-	 */
-	for(nbits=0; last>>=1; nbits++)
-		;
-	ix->maxdepth = nbits;
-
-	if((1UL<<ix->maxdepth) > ix->buckets-ix->bitbuckets){
-		seterr(ECorrupt, "inconsistent math for buckets in %s", ix->name);
+	if(initindex1(ix) < 0){
 		freeindex(ix);
 		return nil;
 	}
@ -195,9 +191,37 @@ initindex(char *name, ISect **sects, int n)
 		freeindex(ix);
 		return nil;
 	}
+
 	return ix;
 }

+static int
+initindex1(Index *ix)
+{
+	u32int buckets;
+
+	switch(ix->version){
+	case IndexVersion1:
+		ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
+		buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
+		if(buckets != ix->buckets){
+			seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
+			return -1;
+		}
+		break;
+
+	case IndexVersion2:
+		buckets = ix->buckets - ix->bitblocks;
+		if(ix->buckets < ix->bitblocks || (buckets&(buckets-1)))
+			seterr(ECorrupt, "bucket count not a power of two in %s", ix->name);
+		ix->maxdepth = u64log2(buckets);
+		ix->bitkeylog = u64log2(ix->blocksize*8);
+		ix->bitkeymask = (1<<ix->bitkeylog)-1;
+		break;
+	}
+	return 0;
+}
+
 int
 wbindex(Index *ix)
 {
@ -237,7 +261,7 @@ wbindex(Index *ix)
 }

 /*
- * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' sections arenas
+ * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
 * version, blocksize: u32int
 * name: max. ANameSize string
 * sections, arenas: AMap
@ -246,6 +270,7 @@ int
 outputindex(Fmt *f, Index *ix)
 {
 	if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
+	|| (ix->version==IndexVersion2 && fmtprint(f, "%ud\n", ix->bitblocks) < 0)
 	|| outputamap(f, ix->smap, ix->nsects) < 0
 	|| outputamap(f, ix->amap, ix->narenas) < 0)
 		return -1;
@ -276,7 +301,7 @@ parseindex(IFile *f, Index *ix)
 		return -1;
 	}
 	ix->version = v;
-	if(ix->version != IndexVersion){
+	if(ix->version != IndexVersion1 && ix->version != IndexVersion2){
 		seterr(ECorrupt, "bad version number in %s", f->name);
 		return -1;
 	}
@ -293,11 +318,22 @@ parseindex(IFile *f, Index *ix)
 	 * block size
 	 */
 	if(ifileu32int(f, &v) < 0){
-		seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
+		seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
 		return -1;
 	}
 	ix->blocksize = v;

+	if(ix->version == IndexVersion2){
+		/*
+		 * free bitmap size
+		 */
+		if(ifileu32int(f, &v) < 0){
+			seterr(ECorrupt, "syntax error: bad bitmap size in %s", f->name);
+			return -1;
+		}
+		ix->bitblocks = v;
+	}
+
 	if(parseamap(f, &amn) < 0)
 		return -1;
 	ix->nsects = amn.n;
@ -320,8 +356,10 @@ newindex(char *name, ISect **sects, int n)
 	Index *ix;
 	AMap *smap;
 	u64int nb;
-	u32int div, ub, xb, start, stop, blocksize, tabsize;
-	int i, j;
+	u32int div, ub, xb, fb, start, stop, blocksize, tabsize;
+	int i, j, version;
+
+	version = IndexVersion2;

 	if(n < 1){
 		seterr(EOk, "creating index with no index sections");
@ -368,16 +406,27 @@ newindex(char *name, ISect **sects, int n)
 		seterr(EBug, "index too large");
 		return nil;
 	}
-	div = (((u64int)1 << 32) + nb - 1) / nb;
-	ub = (((u64int)1 << 32) - 1) / div + 1;
-	if(div < 100){
-		seterr(EBug, "index divisor too coarse");
-		return nil;
+
+	div = 0;
+	fb = 0;
+	if(version == IndexVersion1){
+		div = (((u64int)1 << 32) + nb - 1) / nb;
+		ub = (((u64int)1 << 32) - 1) / div + 1;
+		if(div < 100){
+			seterr(EBug, "index divisor too coarse");
+			return nil;
+		}
+	}else{
+		fb = (nb+blocksize*8-1)/(blocksize*8);
+		for(ub=1; ub<=((nb-fb)>>1); ub<<=1)
+			;
+		ub += fb;
 	}
 	if(ub > nb){
 		seterr(EBug, "index initialization math wrong");
 		return nil;
 	}
+	xb = nb - ub;

 	/*
 	 * initialize each of the index sections
@ -388,7 +437,6 @@ newindex(char *name, ISect **sects, int n)
 		seterr(EOk, "can't create new index: out of memory");
 		return nil;
 	}
-	xb = nb - ub;
 	start = 0;
 	for(i = 0; i < n; i++){
 		stop = start + sects[i]->blocks - xb / n;
@ -413,15 +461,22 @@ newindex(char *name, ISect **sects, int n)
 		free(smap);
 		return nil;
 	}
-	ix->version = IndexVersion;
+	ix->version = version;
 	namecp(ix->name, name);
 	ix->sects = sects;
 	ix->smap = smap;
 	ix->nsects = n;
 	ix->blocksize = blocksize;
-	ix->div = div;
 	ix->buckets = ub;
 	ix->tabsize = tabsize;
+	ix->div = div;
+	ix->bitblocks = fb;
+
+	if(initindex1(ix) < 0){
+		free(smap);
+		return nil;
+	}
+
 	return ix;
 }

@ -489,7 +544,7 @@ newisect(Part *part, char *name, u32int blocksize, u32int tabsize)
 }

 /*
- * initialize the computed paramaters for an index
+ * initialize the computed parameters for an index
 */
 static ISect*
 initisect1(ISect *is)
@ -606,7 +661,7 @@ writeiclump(Index *ix, Clump *c, u8int *clbuf)
 }

 /*
- * convert an arena index to an relative address address
+ * convert an arena index to an relative arena address
 */
 Arena*
 amapitoa(Index *ix, u64int a, u64int *aa)
@ -665,20 +720,15 @@ loadientry(Index *ix, u8int *score, int type, IEntry *ie)
 	u32int buck;
 	int h, ok;

-	buck = hashbits(score, 32) / ix->div;
 	ok = -1;

 	qlock(&stats.lock);
 	stats.indexreads++;
 	qunlock(&stats.lock);
-	is = findibucket(ix, buck, &buck);
-	if(is == nil)
-		return -1;
-	b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
+	b = loadibucket(ix, score, &is, &buck, &ib);
 	if(b == nil)
 		return -1;

-	unpackibucket(&ib, b->data);
 	if(okibucket(&ib, is) < 0)
 		goto out;

@ -707,19 +757,16 @@ storeientry(Index *ix, IEntry *ie)
 	u32int buck;
 	int h, ok;

-	buck = hashbits(ie->score, 32) / ix->div;
 	ok = 0;

 	qlock(&stats.lock);
 	stats.indexwreads++;
 	qunlock(&stats.lock);

-	is = findibucket(ix, buck, &buck);
-	b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
+	b = loadibucket(ix, ie->score, &is, &buck, &ib);
 	if(b == nil)
 		return -1;

-	unpackibucket(&ib, b->data);
 	if(okibucket(&ib, is) < 0)
 		goto out;

@ -765,177 +812,14 @@ writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b)
 	return 0;
 }

-/*
- * find the number of the index section holding score
- */
-int
-indexsect(Index *ix, u8int *score)
-{
-	u32int buck;
-	int r, l, m;
-
-	buck = hashbits(score, 32) / ix->div;
-	l = 1;
-	r = ix->nsects - 1;
-	while(l <= r){
-		m = (r + l) >> 1;
-		if(ix->sects[m]->start <= buck)
-			l = m + 1;
-		else
-			r = m - 1;
-	}
-	return l - 1;
-}
-
-/*
- * find the index section which holds bucket #buck.
- */
-static ISect*
-findisect(Index *ix, u32int buck, u32int *ibuck)
-{
-	ISect *is;
-	int r, l, m;
-
-	l = 1;
-	r = ix->nsects - 1;
-	while(l <= r){
-		m = (r + l) >> 1;
-		if(ix->sects[m]->start <= buck)
-			l = m + 1;
-		else
-			r = m - 1;
-	}
-	is = ix->sects[l - 1];
-	if(is->start <= buck && is->stop > buck){
-		*ibuck = buck - is->start;
-		return is;
-	}
-	seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
-	return nil;
-}
-
-static DBlock*
-loadisectblock(Index *ix, u32int buck, int read)
-{
-	ISect *is;
-
-	if((is = findisect(ix, buck, &buck)) == nil)
-		return nil;
-	return getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), read);
-}
-
-/*
- * find the index section which holds the logical bucket #buck
- */
-static DBlock*
-loadibucket(Index *ix, u32int buck, IBucket *ib)
-{
-	int d, i, times;
-	u32int ino;
-	DBlock *b;
-	u32int bbuck;
-	IBucket eib;
-
-	times = 0;
-
-top:
-	if(times++ > 2*ix->maxdepth){
-		seterr(EAdmin, "bucket bitmap tree never converges with buckets");
-		return nil;
-	}
-
-	bbuck = -1;
-	b = nil;
-
-	/*
-	 * consider the bits of buck, one at a time, to make the bucket number.
-	 */
-
-	/*
-	 * walk down the bucket tree using the bitmap, which is used so
-	 * often it's almost certain to be in cache.
-	 */
-	ino = 0;
-	for(d=0; d<ix->maxdepth; d++){
-		/* fetch the bitmap that says whether ino has been split */
-		if(bbuck != (ino>>ix->bitlog)){
-			if(b)
-				putdblock(b);
-			bbuck = (ino>>ix->bitlog);
-			if((b = loadisectblock(ix, bbuck, 1)) == nil)
-				return nil;
-		}
-		/* has it been split yet? */
-		if((((u32int*)b->data)[(ino&(ix->bitmask))>>5] & (1<<(ino&31))) == 0){
-			/* no.  we're done */
-			break;
-		}
-	}
-	putdblock(b);
-
-	/*
-	 * continue walking down (or up!) the bucket tree, which may not
-	 * be completely in sync with the bitmap.  we could continue the loop
-	 * here, but it's easiest just to start over once we correct the bitmap.
-	 * corrections should only happen when things get out of sync because
-	 * a crash keeps some updates from making it to disk, so it's not too
-	 * frequent.  we should converge after 2x the max depth, at the very worst
-	 * (up and back down the tree).
-	 */
-	if((b = loadisectblock(ix, ix->bitbuckets+bucketno(buck, d), 1)) == nil)
-		return nil;
-	unpackibucket(&eib, b->data);
-	if(eib.depth > d){
-		/* the bitmap thought this block hadn't split */
-		putdblock(b);
-		if(markblocksplit(buck, d) < 0)
-			return nil;
-		goto top;
-	}
-	if(eib.depth < d){
-		/* the bitmap thought this block had split */
-		putdblock(b);
-		if(markblockunsplit(ix, buck, d) < 0)
-			return nil;
-		goto top;
-	}
-	*ib = eib;
-	return b;
-}
-
-static int
-markblocksplit(Index *ix, u32int buck, int d)
-{
-	u32int ino;
-
-	ino = bucketno(buck, d);
-	if((b = loadisectblock(ix, ino>>ix->bitlog, 1)) == nil)
-		return -1;
-	dirtydblock(b, DirtyIndex);
-	(((u32int*)b->data)[(ino&(ix->bitmask))>>5] |= (1<<(ino&31));
-	putdblock(b);
-	return 0;
-}
-
-static int
-markblockunsplit(Index *ix, u32int buck, int d)
-{
-	/*
-	 * Let's 
-	u32int ino;
-
-	ino = bucketno(buck, d);
-	
-}
-
 static int
 okibucket(IBucket *ib, ISect *is)
 {
-	if(ib->n <= is->buckmax && (ib->next == 0 || ib->next >= is->start && ib->next < is->stop))
+	if(ib->n <= is->buckmax)
 		return 0;

-	seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, next=%lud range=[%lud,%lud)",
-		ib->n, is->buckmax, ib->next, is->start, is->stop);
+	seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, depth=%lud range=[%lud,%lud)",
+		ib->n, is->buckmax, ib->depth, is->start, is->stop);
 	return -1;
 }

@ -1010,3 +894,223 @@ ientrycmp(const void *vie1, const void *vie2)
 	}
 	return -1;
 }
+
+/*
+ * find the number of the index section holding bucket #buck
+ */
+static int
+indexsect0(Index *ix, u32int buck)
+{
+	int r, l, m;
+
+	l = 1;
+	r = ix->nsects - 1;
+	while(l <= r){
+		m = (r + l) >> 1;
+		if(ix->sects[m]->start <= buck)
+			l = m + 1;
+		else
+			r = m - 1;
+	}
+	return l - 1;
+}
+
+/*
+ * load the index block at bucket #buck
+ */
+static DBlock*
+loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+	ISect *is;
+	DBlock *b;
+
+	is = ix->sects[indexsect0(ix, buck)];
+	if(buck < is->start || is->stop <= buck){
+		seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
+		return nil;
+	}
+
+	buck -= is->start;
+	if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1)) == nil)
+		return nil;
+
+	*pis = is;
+	*pbuck = buck;
+	unpackibucket(ib, b->data);
+	return b;
+}
+
+/*
+ * find the number of the index section holding score
+ */
+static int
+indexsect1(Index *ix, u8int *score)
+{
+	return indexsect0(ix, hashbits(score, 32) / ix->div);
+}
+
+/*
+ * load the index block responsible for score.
+ */
+static DBlock*
+loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+	return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib);
+}
+
+static u32int
+keytobuck(Index *ix, u32int key, int d)
+{
+	/* clear all but top d bits */
+	if(d != 32)
+		key &= ~((1<<(32-d))-1);
+
+	/* truncate to maxdepth bits */
+	if(ix->maxdepth != 32)
+		key >>= 32 - ix->maxdepth;
+
+	return ix->bitblocks + key;
+}
+
+/*
+ * to check whether .xxx has split, check whether .xxx1 is in use.
+ * it might be worth caching the block for future lookups, but for now
+ * let's assume the dcache is good enough.
+ */
+static int
+bitmapop(Index *ix, u32int key, int d, int set)
+{
+	DBlock *b;
+	ISect *is;
+	IBucket ib;
+	u32int buck;
+	int inuse;
+
+	if(d >= ix->maxdepth)
+		return 0;
+
+	/* construct .xxx1 in bucket number format */
+	key = keytobuck(ix, key, d) | (1<<(ix->maxdepth-d-1));
+
+	/* check whether key (now the bucket number for .xxx1) is in use */
+
+	if((b = loadibucket0(ix, key >> ix->bitkeylog, &is, &buck, &ib)) == nil){
+		seterr(ECorrupt, "cannot load in-use bitmap block");
+		return -1;
+	}
+	inuse = ((u32int*)b->data)[(key & ix->bitkeymask)>>5] & (1<<(key&31));
+	if(set && !inuse){
+		dirtydblock(b, DirtyIndexBitmap);
+		((u32int*)b->data)[(key & ix->bitkeymask)>>5] |= (1<<(key&31));
+	}
+	putdblock(b);
+	return inuse;
+}
+
+static int
+issplit(Index *ix, u32int key, int d)
+{
+	return bitmapop(ix, key, d, 0);
+}
+
+static int
+marksplit(Index *ix, u32int key, int d)
+{
+	return bitmapop(ix, key, d, 1);
+}	
+
+/* 
+ * find the number of the index section holding score.
+ * it's not terrible to be wrong once in a while, so we just
+ * do what the bitmap tells us and don't worry about the 
+ * bitmap being out of date.
+ */
+static int
+indexsect2(Index *ix, u8int *score)
+{
+	u32int key;
+	int d, is;
+
+	key = hashbits(score, 32);
+	for(d=0; d<=ix->maxdepth; d++){
+		is = issplit(ix, key, d);
+		if(is == -1)
+			return 0;	/* must return something valid! */
+		if(!is)
+			break;
+	}
+
+	if(d > ix->maxdepth){
+		seterr(EBug, "index bitmap inconsistent with maxdepth");
+		return 0;	/* must return something valid! */
+	}
+
+	return indexsect0(ix, keytobuck(ix, key, d));
+}
+
+/*
+ * load the index block responsible for score. 
+ * (unlike indexsect2, must be correct always.)
+ */
+static DBlock*
+loadibucket2(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+	u32int key;
+	int d, try, is;
+	DBlock *b;
+
+	for(try=0; try<32; try++){
+		key = hashbits(score, 32);
+		for(d=0; d<=ix->maxdepth; d++){
+			is = issplit(ix, key, d);
+			if(is == -1)
+				return nil;
+			if(!is)
+				break;
+		}
+		if(d > ix->maxdepth){
+			seterr(EBug, "index bitmap inconsistent with maxdepth");
+			return nil;
+		}
+
+		if((b = loadibucket0(ix, keytobuck(ix, key, d), pis, pbuck, ib)) == nil)
+			return nil;
+
+		if(ib->depth == d)
+			return b;
+
+		if(ib->depth < d){
+			seterr(EBug, "index block has smaller depth than expected -- cannot happen");
+			putdblock(b);
+			return nil;
+		}
+
+		/*
+		 * ib->depth > d, meaning the bitmap was out of date.
+		 * fix the bitmap and try again.
+		 */
+		putdblock(b);
+		if(marksplit(ix, key, d) < 0)
+			return nil;
+	}
+	seterr(EBug, "loadibucket2 failed to sync bitmap with disk!");
+	return nil;
+}
+
+int
+indexsect(Index *ix, u8int *score)
+{
+	if(ix->version == IndexVersion1)
+		return indexsect1(ix, score);
+	return indexsect2(ix, score);
+}
+
+DBlock*
+loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+	if(ix->version == IndexVersion1)
+		return loadibucket1(ix, score, pis, pbuck, ib);
+	return loadibucket2(ix, score, pis, pbuck, ib);
+}
+
+
--- a/src/cmd/venti/lump.c
+++ b/src/cmd/venti/lump.c
@ -81,7 +81,6 @@ writelump(Packet *p, u8int *score, int type, u32int creator)
 		return ok;
 	}

-print("writelump %08x\n", mainindex->arenas[0]);
 	if(queuewrites)
 		return queuewrite(u, p, creator);

--- a/src/cmd/venti/mkfile
+++ b/src/cmd/venti/mkfile
@ -44,9 +44,9 @@ TARG=\
 	fmtarenas\
 	fmtisect\
 	fmtindex\
-	buildindex\
+#	buildindex\
 	checkarenas\
-	checkindex\
+#	checkindex\
 	clumpstats\
 	findscore\
 	rdarena\
--- a/src/cmd/venti/part.c
+++ b/src/cmd/venti/part.c
@ -2,6 +2,8 @@
 #include "dat.h"
 #include "fns.h"

+#define trace 0
+
 u32int	maxblocksize;
 int	readonly;

@ -75,7 +77,7 @@ writepart(Part *part, u64int addr, u8int *buf, u32int n)
 		seterr(ECorrupt, "out of bounds write to partition='%s'", part->name);
 		return -1;
 	}
-	print("write %s %lud at %llud\n", part->name, n, addr);
+if(trace) print("write %s %lud at %llud\n", part->name, n, addr);
 	for(nn = 0; nn < n; nn += m){
 		mm = n - nn;
 		if(mm > MaxIo)
@ -107,7 +109,7 @@ readpart(Part *part, u64int addr, u8int *buf, u32int n)
 		seterr(ECorrupt, "out of bounds read from partition='%s': addr=%lld n=%d size=%lld", part->name, addr, n, part->size);
 		return -1;
 	}
-	print("read %s %lud at %llud\n", part->name, n, addr);
+if(trace) print("read %s %lud at %llud\n", part->name, n, addr);
 	for(nn = 0; nn < n; nn += m){
 		mm = n - nn;
 		if(mm > MaxIo)
--- a/src/cmd/venti/venti.c
+++ b/src/cmd/venti/venti.c
@ -147,8 +147,8 @@ ventiserver(char *addr)

 	while((r = vtgetreq(s)) != nil){
 		r->rx.type = r->tx.type+1;
-		print("req (arenas[0]=%p sects[0]=%p) %F\n",
-			mainindex->arenas[0], mainindex->sects[0], &r->tx);
+	//	print("req (arenas[0]=%p sects[0]=%p) %F\n",
+	//		mainindex->arenas[0], mainindex->sects[0], &r->tx);
 		switch(r->tx.type){
 		default:
 			vtrerror(r, "unknown request");