1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
|
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996, 1997, 1998, 1999
* Sleepycat Software. All rights reserved.
*
* @(#)mp.h 11.3 (Sleepycat) 10/6/99
*/
/*
 * Forward declarations of the memory pool structure tags, together with
 * the short type names the rest of the memory pool code uses for them.
 * Each typedef also introduces its (still incomplete) struct tag.
 */
typedef struct __bh		BH;
typedef struct __db_mpool	DB_MPOOL;
typedef struct __db_mpreg	DB_MPREG;
typedef struct __mcache		MCACHE;
typedef struct __mpool		MPOOL;
typedef struct __mpoolfile	MPOOLFILE;
typedef struct __cmpr		CMPR;
typedef struct __cmpr_context	CMPR_CONTEXT;
/* Smallest cache size that is accepted: 20K bytes. */
#define	DB_CACHESIZE_MIN	(20 * 1024)

/* Number of files an environment has room for by default. */
#define	DB_MPOOLFILE_DEF	500
/*
 * DB_MPOOL --
 *	Per-process memory pool structure.
 *
 *	The members are grouped by how they are protected: the first group
 *	needs protection for multi-threaded use, the second group is not
 *	thread-protected, and the status of the last member is unresolved
 *	(see the note on recursion_level).
 */
struct __db_mpool {
/* These fields need to be protected for multi-threaded support. */
MUTEX *mutexp; /* Structure thread lock. */
/* List of pgin/pgout routines (struct __db_mpreg entries). */
LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;
/* List of DB_MPOOLFILE's. */
TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;
/* These fields are not thread-protected. */
DB_ENV *dbenv; /* Reference to error information. */
REGINFO reginfo; /* Main shared region. */
int nc_reg; /* Number of underlying cache regions. */
REGINFO *c_reginfo; /* Underlying cache regions; indexed by BH_TO_CACHE. */
/*
 * NOTE(review): the original author was not sure whether this field
 * needs to be thread-protected -- confirm before relying on it from
 * more than one thread.
 */
int recursion_level; /* Limits recursion from weak compression. */
};
/*
 * DB_MPREG --
 *	DB_MPOOL registry of pgin/pgout functions.
 *
 *	One entry pairs a file type with the two page conversion routines
 *	registered for it.  Entries are linked through q; struct __db_mpool
 *	declares the matching list head (dbregq).
 */
struct __db_mpreg {
LIST_ENTRY(__db_mpreg) q; /* Linked list. */
int ftype; /* File type. */
/*
 * Pgin, pgout routines.  Both take a page number, an untyped pointer
 * and a DBT, and return an int.
 * NOTE(review): presumably the pointer is the page buffer, the DBT is
 * the pgin/pgout cookie and the result is 0 or an error code -- confirm
 * against the callers.
 */
int (*pgin) __P((db_pgno_t, void *, DBT *));
int (*pgout) __P((db_pgno_t, void *, DBT *));
};
/*
 * CMPR_CONTEXT --
 *	Shared compression information.
 *
 *	Embedded by value in struct __db_mpoolfile (member cmpr_context).
 */
struct __cmpr_context {
/*
 * NOTE(review): presumably the suffix appended to a file name to form the
 * name of its weakcmpr database -- confirm against the users of the macro.
 */
#define DB_CMPR_SUFFIX "_weakcmpr"
DB *weakcmpr; /* Free weakcmpr pages pool. */
};
/*
 * DB_MPOOLFILE --
 *	Per-process DB_MPOOLFILE information.
 *
 *	Not every member is protected the same way: pinref and q are
 *	protected by the region lock rather than by the structure's thread
 *	lock.  See the notes on each of them below.
 */
struct __db_mpoolfile {
/* These fields need to be protected for multi-threaded support. */
MUTEX *mutexp; /* Structure thread lock. */
DB_FH fh; /* Underlying file handle. */
u_int32_t ref; /* Reference count. */
/*
 * !!!
 * This field is a special case -- it's protected by the region lock
 * NOT the thread lock.  The reason for this is that we always have
 * the region lock immediately before or after we modify the field,
 * and we don't want to use the structure lock to protect it because
 * then I/O (which is done with the structure lock held because of
 * the race between the seek and write of the file descriptor) will
 * block any other put/get calls using this DB_MPOOLFILE structure.
 */
u_int32_t pinref; /* Pinned block reference count. */
/*
 * !!!
 * This field is a special case -- it's protected by the region lock
 * since it's manipulated only when new files are added to the list.
 */
TAILQ_ENTRY(__db_mpoolfile) q; /* Linked list of DB_MPOOLFILE's. */
/* These fields are not thread-protected. */
DB_MPOOL *dbmp; /* Overlying DB_MPOOL. */
MPOOLFILE *mfp; /* Underlying MPOOLFILE. */
void *addr; /* Address of mmap'd region. */
size_t len; /* Length of mmap'd region. */
/*
 * These fields need to be protected for multi-threaded support.
 *
 * The MP_* values below are the bits stored in flags.  Other structures
 * in this file define their own MP_* bits that reuse the same numeric
 * values (MP_LSN_RETRY in struct __mpool, MP_CAN_MMAP and friends in
 * struct __mpoolfile), so a bit has to be tested against the flags word
 * of the structure it was defined for.
 */
#define MP_READONLY 0x01 /* File is readonly. */
#define MP_UPGRADE 0x02 /* File descriptor is readwrite. */
#define MP_UPGRADE_FAIL 0x04 /* Upgrade wasn't possible. */
#define MP_CMPR 0x08 /* Transparent I/O compression. */
u_int32_t flags;
CMPR_CONTEXT cmpr_context; /* Shared compression information. */
};
/*
 * NCACHE --
 *	Select a cache based on the page number.  This assumes accesses are
 *	uniform across pages, which is probably OK -- what we really want to
 *	avoid is anything that puts all the pages for any single file in the
 *	same cache, as we expect that file access will be bursty.
 *
 *	mp	Address of the MPOOL structure; converted to MPOOL * here, so
 *		any pointer expression may be passed.
 *	pgno	Page number.
 *
 *	Evaluates to pgno modulo the MPOOL's nc_reg.  nc_reg must not be
 *	zero, as it is used as a divisor.
 *
 *	The mp argument is parenthesized before the cast is applied so that
 *	an argument such as "base + offset" is converted as a whole, instead
 *	of the cast binding to "base" only.
 */
#define	NCACHE(mp, pgno)						\
	((pgno) % ((MPOOL *)(mp))->nc_reg)
/*
 * NBUCKET --
 *	Pick the hash bucket for a page.
 *
 *	Early pages of a file are assumed to be retrieved more often than
 *	later ones, so the top bits of the page number would be the more
 *	interesting ones to hash on, being less likely to collide.  However,
 *	512 8K pages already make up a 4MB file, so only reasonably large
 *	files have page numbers with anything but the bottom 9 bits set.
 *	The MPOOL offset of the MPOOLFILE backing the page is therefore
 *	shifted past those 9 bits and XOR'd in, since it should also be
 *	unique for the page.  Nothing fancier is done on purpose: speed
 *	matters more here than the quality of the hash.
 *
 *	mc		Pointer to the cache; its htab_buckets is the modulus.
 *	mf_offset	MPOOL offset of the backing MPOOLFILE.
 *	pgno		Page number.
 */
#define	NBUCKET(mc, mf_offset, pgno)					\
	((((mf_offset) << 9) ^ (pgno)) % (mc)->htab_buckets)
/*
 * MPOOL --
 *	Shared memory pool region.  One of these is allocated in shared
 *	memory, and describes the entire pool.
 */
struct __mpool {
SH_TAILQ_HEAD(__mpfq) mpfq; /* List of MPOOLFILEs. */
/*
 * We single-thread CDB_memp_sync and CDB_memp_fsync calls.
 *
 * This mutex is intended *only* to single-thread access to the call,
 * it is not used to protect the lsn and lsn_cnt fields, the region
 * lock is used to protect them.
 */
MUTEX sync_mutex; /* Checkpoint lock. */
DB_LSN lsn; /* Maximum checkpoint LSN. */
u_int32_t lsn_cnt; /* Checkpoint buffers left to write. */
u_int32_t nc_reg; /* Number of underlying REGIONS; the NCACHE modulus. */
roff_t c_regids; /* Array of underlying REGION Ids. */
#define MP_LSN_RETRY 0x01 /* Retry all BH_WRITE buffers. */
u_int32_t flags;
/*
 * HACK!!
 * A pointer allocated for this structure is (erroneously?) used in
 * CDB___memp_alloc() to refer to an MCACHE structure.  The padding
 * below makes sure the allocation is big enough for that.
 * NOTE(review): nothing visible here checks that 100 ints are enough to
 * cover an MCACHE -- confirm if MCACHE or DB_MPOOL_STAT ever grows.
 */
int dummy [100];
};
/*
 * MCACHE --
 *	The memory pool may be broken up into individual pieces/files.  Not
 *	what we would have liked, but on Solaris you can allocate only a
 *	little more than 2GB of memory in a single contiguous chunk, and I
 *	expect to see more systems with similar issues.  An MCACHE structure
 *	describes a backing piece of memory used as a cache.
 */
struct __mcache {
SH_TAILQ_HEAD(__bhq) bhq; /* LRU list of buffer headers. */
int htab_buckets; /* Number of hash table entries; the NBUCKET modulus. */
roff_t htab; /* Hash table offset. */
DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */
};
/*
 * MPOOLFILE --
 *	Shared DB_MPOOLFILE information.
 *
 *	The file name, the file identification and the pgin/pgout cookie are
 *	not stored in the structure itself; it records their locations as
 *	roff_t values (path_off, fileid_off, pgcookie_off).
 */
struct __mpoolfile {
SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */
int ftype; /* File type. */
int32_t lsn_off; /* Page's LSN offset. */
u_int32_t clear_len; /* Bytes to clear on page create. */
roff_t path_off; /* File name location. */
roff_t fileid_off; /* File identification location. */
roff_t pgcookie_len; /* Pgin/pgout cookie length. */
roff_t pgcookie_off; /* Pgin/pgout cookie location. */
u_int32_t lsn_cnt; /* Checkpoint buffers left to write. */
db_pgno_t last_pgno; /* Last page in the file. */
db_pgno_t orig_last_pgno; /* Original last page in the file. */
/*
 * Bits stored in flags.  They reuse the numeric values of the MP_* bits
 * that struct __db_mpoolfile and struct __mpool define for their own
 * flags words, so they are only meaningful for this structure.
 */
#define MP_CAN_MMAP 0x01 /* If the file can be mmap'd. */
#define MP_REMOVED 0x02 /* Backing file has been removed. */
#define MP_TEMP 0x04 /* Backing file is a temporary. */
u_int32_t flags;
DB_MPOOL_FSTAT stat; /* Per-file mpool statistics. */
};
/*
 * BH_TO_CACHE --
 *	Return the cache where we can find the specified buffer header.
 *
 *	dbmp	Pointer to the per-process pool; its reginfo.primary is handed
 *		to NCACHE and its c_reginfo array is indexed with the result.
 *	bhp	Pointer to the buffer header; only its pgno is read.
 *
 *	Evaluates to the primary member of the selected c_reginfo element.
 *	Note that dbmp is evaluated twice.
 */
#define	BH_TO_CACHE(dbmp, bhp)						\
	((dbmp)->c_reginfo[						\
	    NCACHE((dbmp)->reginfo.primary, (bhp)->pgno)].primary)
/*
* DB_CMPR --
* Page compression information
*
* !!!
* There is no need to keep the length of the data written
* in the page since it's already encoded in the compressed
* data.
*/
/*
 * DB_CMPR_DIVIDE, DB_CMPR_MULTIPLY --
 *	Convert between a size and the expected compressed size.
 *
 *	DB_CMPR_DIVIDE shifts size right, and DB_CMPR_MULTIPLY shifts it
 *	left, by the value CDB___memp_cmpr_coefficient(dbenv) returns.
 */
#define	DB_CMPR_DIVIDE(dbenv, size)					\
	((size) >> CDB___memp_cmpr_coefficient(dbenv))
#define	DB_CMPR_MULTIPLY(dbenv, size)					\
	((size) << CDB___memp_cmpr_coefficient(dbenv))
/*
 * CMPR --
 *	Per-page compression header.  DB_CMPR_OVERHEAD is the size of this
 *	structure, i.e. the space reserved for it at the beginning of each
 *	compressed page.
 */
struct __cmpr {
/* Bits stored in flags. */
#define DB_CMPR_FIRST 0x01 /* Head of chain. */
#define DB_CMPR_INTERNAL 0x02 /* Weak compression data. */
#define DB_CMPR_CHAIN 0x04 /* More data in next page. */
#define DB_CMPR_FREE 0x08 /* Not in use. */
u_int16_t flags;
/*
 * Filled if DB_CMPR_CHAIN set.
 * NOTE(review): presumably the page number of the next page of the
 * chain -- confirm against the compression code.
 */
db_pgno_t next;
};
/*
 * DB_CMPR_OVERHEAD --
 *	Size of the information reserved at the beginning of each compressed
 *	page (a struct __cmpr).
 */
#define	DB_CMPR_OVERHEAD	(sizeof(struct __cmpr))
/*
 * DB_CMPR_PAGESIZE --
 *	Size of an I/O page without the reserved information.
 *
 *	io	Pointer to a structure with a pagesize member.  The argument
 *		is parenthesized so that any pointer expression (for example
 *		"p + 1") can be passed.
 *
 *	The subtraction is done in an unsigned type (the type of sizeof), so
 *	a pagesize smaller than DB_CMPR_OVERHEAD wraps around instead of
 *	going negative.
 */
#define	DB_CMPR_PAGESIZE(io)	((io)->pagesize - DB_CMPR_OVERHEAD)
/*
 * DB_CMPR_DATA --
 *	Pointer to the data within a raw compressed buffer: the buf member
 *	of io, advanced past the reserved information.
 *
 *	io	Pointer to a structure with a buf member; parenthesized for
 *		the same reason as in DB_CMPR_PAGESIZE.
 */
#define	DB_CMPR_DATA(io)	((io)->buf + DB_CMPR_OVERHEAD)
/*
 * BH --
 *	Buffer header.
 *
 *	The header is immediately followed by the page data: buf is the last
 *	member and is declared with a single element only as a placeholder
 *	for the variable length data.
 */
struct __bh {
MUTEX mutex; /* Buffer thread/process lock. */
u_int16_t ref; /* Reference count. */
/* Bits stored in flags; the largest one (0x100) still fits in 16 bits. */
#define BH_CALLPGIN 0x001 /* Page needs to be reworked... */
#define BH_DIRTY 0x002 /* Page was modified. */
#define BH_DISCARD 0x004 /* Page is useless. */
#define BH_LOCKED 0x008 /* Page is locked (I/O in progress). */
#define BH_TRASH 0x010 /* Page is garbage. */
#define BH_WRITE 0x020 /* Page scheduled for writing. */
#define BH_CMPR 0x040 /* Chain contains valid data. */
#define BH_CMPR_POOL 0x080 /* Chain allocated in pool. */
#define BH_CMPR_OS 0x100 /* Chain allocated with malloc. */
u_int16_t flags;
/*
 * Compression chain.
 * NOTE(review): presumably BH_CMPR_POOL / BH_CMPR_OS record how this
 * chain was allocated and therefore how it has to be released --
 * confirm against the compression code.
 */
db_pgno_t *chain; /* Compression chain. */
SH_TAILQ_ENTRY q; /* LRU queue. */
SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */
db_pgno_t pgno; /* Underlying MPOOLFILE page number. */
roff_t mf_offset; /* Associated MPOOLFILE offset. */
/*
 * !!!
 * This array must be size_t aligned -- the DB access methods put PAGE
 * and other structures into it, and expect to be able to access them
 * directly.  (We guarantee size_t alignment in the documentation too.)
 *
 * Do not turn this into a flexible array member without auditing every
 * use of sizeof(BH): the one-element declaration is part of the
 * structure's size.
 */
u_int8_t buf[1]; /* Variable length data. */
};
#include "mp_ext.h"
|