//.........这里部分代码省略.........
entry->flags = flags;
entry->count = count;
entry->bloom_bit_count = bloom_bit_count;
entry->bloom_hash_count = bloom_hash_count;
++cjoin->entries_next;
} else {
/* Merge the join into an existing entry for this index */
if (count != 0 && entry->count != 0 && entry->count != count)
WT_ERR_MSG(session, EINVAL,
"count=%" PRIu64 " does not match "
"previous count=%" PRIu64 " for this index",
count, entry->count);
if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
WT_ERR_MSG(session, EINVAL,
"join has incompatible strategy "
"values for the same index");
/*
* Check against other comparisons (we call them endpoints)
* already set up for this index.
* We allow either:
* - one or more "eq" (with disjunction)
* - exactly one "eq" (with conjunction)
* - exactly one of "gt" or "ge" (conjunction or disjunction)
* - exactly one of "lt" or "le" (conjunction or disjunction)
* - one of "gt"/"ge" along with one of "lt"/"le"
* (currently restricted to conjunction).
*
* Some other combinations, although expressible either do
* not make sense (X == 3 AND X == 5) or are reducible (X <
* 7 AND X < 9). Other specific cases of (X < 7 OR X > 15)
* or (X == 4 OR X > 15) make sense but we don't handle yet.
*/
for (i = 0; i < entry->ends_next; i++) {
end = &entry->ends[i];
range_eq = (range == WT_CURJOIN_END_EQ);
if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
(F_ISSET(end, WT_CURJOIN_END_LT) &&
((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
(end->flags == WT_CURJOIN_END_EQ &&
(range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
!= 0))
WT_ERR_MSG(session, EINVAL,
"join has overlapping ranges");
if (range == WT_CURJOIN_END_EQ &&
end->flags == WT_CURJOIN_END_EQ &&
!F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
WT_ERR_MSG(session, EINVAL,
"compare=eq can only be combined "
"using operation=or");
/*
* Sort "gt"/"ge" to the front, followed by any number
* of "eq", and finally "lt"/"le".
*/
if (!hasins &&
((range & WT_CURJOIN_END_GT) != 0 ||
(range == WT_CURJOIN_END_EQ &&
!F_ISSET(end, WT_CURJOIN_END_GT)))) {
ins = i;
hasins = true;
}
}
/* All checks completed, merge any new configuration now */
entry->count = count;
entry->bloom_bit_count =
WT_MAX(entry->bloom_bit_count, bloom_bit_count);
entry->bloom_hash_count =
WT_MAX(entry->bloom_hash_count, bloom_hash_count);
}
WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
entry->ends_next + 1, &entry->ends));
if (!hasins)
ins = entry->ends_next;
newend = &entry->ends[ins];
memmove(newend + 1, newend,
(entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
entry->ends_next++;
newend->cursor = ref_cursor;
F_SET(newend, range);
/* Open the main file with a projection of the indexed columns. */
if (entry->main == NULL && entry->index != NULL) {
namesize = strlen(cjoin->table->name);
newsize = namesize + entry->index->colconf.len + 1;
WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
snprintf(main_uri, newsize, "%s%.*s",
cjoin->table->name, (int)entry->index->colconf.len,
entry->index->colconf.str);
WT_ERR(__wt_open_cursor(session, main_uri,
(WT_CURSOR *)cjoin, raw_cfg, &entry->main));
}
err: if (main_uri != NULL)
__wt_free(session, main_uri);
return (ret);
}
/*
* __wt_connection_close --
* Close a connection handle.
*/
int
__wt_connection_close(WT_CONNECTION_IMPL *conn)
{
WT_CONNECTION *wt_conn;
WT_DECL_RET;
WT_DLH *dlh;
WT_SESSION_IMPL *s, *session;
u_int i;
wt_conn = &conn->iface;
session = conn->default_session;
/* Shut down the subsystems, ensuring workers see the state change. */
F_SET(conn, WT_CONN_CLOSING);
WT_FULL_BARRIER();
/*
* Clear any pending async operations and shut down the async worker
* threads and system before closing LSM.
*/
WT_TRET(__wt_async_flush(session));
WT_TRET(__wt_async_destroy(session));
/*
* Shut down server threads other than the eviction server, which is
* needed later to close btree handles. Some of these threads access
* btree handles, so take care in ordering shutdown to make sure they
* exit before files are closed.
*/
WT_TRET(__wt_lsm_manager_destroy(session));
/*
* Once the async and LSM threads exit, we shouldn't be opening any
* more files.
*/
F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS);
WT_FULL_BARRIER();
WT_TRET(__wt_checkpoint_server_destroy(session));
WT_TRET(__wt_statlog_destroy(session, true));
WT_TRET(__wt_sweep_destroy(session));
/* The eviction server is shut down last. */
WT_TRET(__wt_evict_destroy(session));
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(session));
/* Shut down metadata tracking. */
WT_TRET(__wt_meta_track_destroy(session));
/*
* Now that all data handles are closed, tell logging that a checkpoint
* has completed then shut down the log manager (only after closing
* data handles). The call to destroy the log manager is outside the
* conditional because we allocate the log path so that printlog can
* run without running logging or recovery.
*/
if (ret == 0 && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
WT_TRET(__wt_txn_checkpoint_log(
session, true, WT_TXN_LOG_CKPT_STOP, NULL));
WT_TRET(__wt_logmgr_destroy(session));
/* Free memory for collators, compressors, data sources. */
WT_TRET(__wt_conn_remove_collator(session));
WT_TRET(__wt_conn_remove_compressor(session));
WT_TRET(__wt_conn_remove_data_source(session));
WT_TRET(__wt_conn_remove_encryptor(session));
WT_TRET(__wt_conn_remove_extractor(session));
/* Disconnect from shared cache - must be before cache destroy. */
WT_TRET(__wt_conn_cache_pool_destroy(session));
/* Discard the cache. */
WT_TRET(__wt_cache_destroy(session));
/* Discard transaction state. */
__wt_txn_global_destroy(session);
/* Close the lock file, opening up the database to other connections. */
if (conn->lock_fh != NULL)
WT_TRET(__wt_close(session, &conn->lock_fh));
/* Close any optrack files */
if (session->optrack_fh != NULL)
WT_TRET(__wt_close(session, &session->optrack_fh));
/* Close operation tracking */
WT_TRET(__wt_conn_optrack_teardown(session, false));
/* Close any file handles left open. */
WT_TRET(__wt_close_connection_close(session));
/*
* Close the internal (default) session, and switch back to the dummy
//.........这里部分代码省略.........
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 *
 *	Every key returned by a merge cursor opened on the single chunk at
 *	chunk_off is inserted into a new bloom filter; the chunk is then
 *	flagged WT_LSM_CHUNK_BLOOM and the tree metadata is rewritten under
 *	the tree write lock.
 *
 *	Returns 0 on success or an error code.  On every exit path taken
 *	after the filter has been created, the merge cursor (if still open)
 *	and the bloom handle are closed and the session's no-cache and
 *	no-eviction flags are cleared.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM key;
	uint64_t insert_count;

	WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

	/*
	 * Both handles start out NULL so the error path can tell which of
	 * them still need to be released.
	 */
	bloom = NULL;
	src = NULL;

	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;

	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	/*
	 * Setup so that we don't hold pages we read into cache, and so
	 * that we don't get stuck if the cache is full. If we allow
	 * ourselves to get stuck creating bloom filters, the entire tree
	 * can stall since there may be no worker threads available to flush.
	 */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		__wt_bloom_insert(bloom, &key);
	}
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * Close the cursor and forget it immediately, so the error path
	 * below cannot close it a second time.
	 */
	WT_TRET(src->close(src));
	src = NULL;

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	__wt_lsm_tree_writelock(session, lsm_tree);
	F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	__wt_lsm_tree_writeunlock(session, lsm_tree);

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	/*
	 * A failure between opening the merge cursor and its normal close
	 * would otherwise leave the cursor open on the session.
	 */
	if (src != NULL)
		WT_TRET(src->close(src));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
/*
 * __wt_block_read_off --
 *	Read the block identified by a file offset and size into a buffer,
 *	then verify it against the caller-supplied checksum.
 *
 *	Returns 0 when the checksums agree, WT_ERROR when they do not, or
 *	whatever error a failing helper reports.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *hdr;
	uint32_t bufsize, computed;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * Diagnostic builds check that the block being read is not on the
	 * available or discard lists.  Salvage is exempt: it may legitimately
	 * read an overflow page that has already been freed.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) {
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
	}
#endif

	/*
	 * Size the buffer for the read.  Read buffers should be aligned, but
	 * there are many buffers and it is hard to be sure all of them are.
	 * A buffer not yet flagged as aligned gets the flag set here and is
	 * asked for more memory than it currently holds, which guarantees it
	 * is reallocated.  (Usually nothing has been allocated yet on reads,
	 * so this costs nothing extra.)
	 */
	bufsize = size;
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		F_SET(buf, WT_ITEM_ALIGNED);
		bufsize = (uint32_t)WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, bufsize));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	/*
	 * Clear the header's checksum field before computing the checksum
	 * (presumably the stored value was computed the same way).  The whole
	 * block is covered only if the header says so; otherwise just the
	 * first WT_BLOCK_COMPRESS_SKIP bytes are.
	 */
	hdr = WT_BLOCK_HEADER_REF(buf->mem);
	hdr->cksum = 0;
	computed = __wt_cksum(buf->mem,
	    F_ISSET(hdr, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);

	if (cksum == computed) {
		WT_CSTAT_INCR(session, block_read);
		WT_CSTAT_INCRV(session, block_byte_read, size);
		return (0);
	}

	/* Mismatch: report it unless salvage asked for quiet errors. */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) {
		__wt_errx(session,
		    "read checksum error [%"
		    PRIu32 "B @ %" PRIuMAX ", %"
		    PRIu32 " != %" PRIu32 "]",
		    size, (uintmax_t)offset, cksum, computed);
	}
	return (WT_ERROR);
}
/*
* __wt_txn_recover --
* Run recovery.
*/
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
WT_CURSOR *metac;
WT_DECL_RET;
WT_RECOVERY r;
struct WT_RECOVERY_FILE *metafile;
char *config;
int needs_rec, was_backup;
conn = S2C(session);
WT_CLEAR(r);
WT_INIT_LSN(&r.ckpt_lsn);
was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;
/* We need a real session for recovery. */
WT_RET(__wt_open_session(conn, NULL, NULL, &session));
F_SET(session, WT_SESSION_NO_LOGGING);
r.session = session;
WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
metafile = &r.files[WT_METAFILE_ID];
metafile->c = metac;
/*
* If no log was found (including if logging is disabled), or if the
* last checkpoint was done with logging disabled, recovery should not
* run. Scan the metadata to figure out the largest file ID.
*/
if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) ||
WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
WT_ERR(__recovery_file_scan(&r));
conn->next_file_id = r.max_fileid;
goto done;
}
/*
* First, do a pass through the log to recover the metadata, and
* establish the last checkpoint LSN. Skip this when opening a hot
* backup: we already have the correct metadata in that case.
*/
if (!was_backup) {
r.metadata_only = 1;
if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
WT_ERR(__wt_log_scan(session,
NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
else {
/*
* Start at the last checkpoint LSN referenced in the
* metadata. If we see the end of a checkpoint while
* scanning, we will change the full scan to start from
* there.
*/
r.ckpt_lsn = metafile->ckpt_lsn;
WT_ERR(__wt_log_scan(session,
&metafile->ckpt_lsn, 0, __txn_log_recover, &r));
}
}
/* Scan the metadata to find the live files and their IDs. */
WT_ERR(__recovery_file_scan(&r));
/*
* We no longer need the metadata cursor: close it to avoid pinning any
* resources that could block eviction during recovery.
*/
r.files[0].c = NULL;
WT_ERR(metac->close(metac));
/*
* Now, recover all the files apart from the metadata.
* Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
*/
r.metadata_only = 0;
WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
"Main recovery loop: starting at %u/%" PRIuMAX,
r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
/*
* Check if the database was shut down cleanly. If not
* return an error if the user does not want automatic
* recovery.
*/
if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR))
WT_ERR(WT_RUN_RECOVERY);
/*
* Always run recovery even if it was a clean shutdown.
* We can consider skipping it in the future.
*/
if (WT_IS_INIT_LSN(&r.ckpt_lsn))
WT_ERR(__wt_log_scan(session, NULL,
WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
__txn_log_recover, &r));
//.........这里部分代码省略.........
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 *
 *	Any existing object at the chunk's bloom URI is dropped first.  Every
 *	key returned by a merge cursor opened on the single chunk at
 *	chunk_off is then inserted into a new filter, the chunk is flagged
 *	WT_LSM_CHUNK_BLOOM and the tree metadata is rewritten under the tree
 *	lock.
 *
 *	Returns 0 on success or an error code.  On every exit path taken
 *	after the filter has been created, the merge cursor (if still open)
 *	and the bloom handle are closed and the session's no-cache flag is
 *	cleared.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM buf, key;
	WT_SESSION *wt_session;
	uint64_t insert_count;
	int exist;

	/*
	 * Normally, the Bloom URI is populated when the chunk struct is
	 * allocated.  After an open, however, it may not have been.
	 * Deal with that here.
	 */
	if (chunk->bloom_uri == NULL) {
		WT_CLEAR(buf);
		WT_RET(__wt_lsm_tree_bloom_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
	}

	/*
	 * Drop the bloom filter first - there may be some content hanging over
	 * from an aborted merge or checkpoint.  The existence check is given
	 * the URI with its leading "file:" prefix skipped.
	 */
	wt_session = &session->iface;
	WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist));
	if (exist)
		WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));

	/*
	 * Both handles start out NULL so the error path can tell which of
	 * them still need to be released.
	 */
	bloom = NULL;
	src = NULL;

	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;

	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	F_SET(session, WT_SESSION_NO_CACHE);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * Close the cursor and forget it immediately, so the error path
	 * below cannot close it a second time.
	 */
	WT_TRET(src->close(src));
	src = NULL;

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_VERBOSE_ERR(session, lsm,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	/*
	 * A failure between opening the merge cursor and its normal close
	 * would otherwise leave the cursor open on the session.
	 */
	if (src != NULL)
		WT_TRET(src->close(src));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE);
	return (ret);
}
//.........这里部分代码省略.........
uint64_t *switch_txnp;
uint64_t snap_min;
lsm_tree = clsm->lsm_tree;
session = (WT_SESSION_IMPL *)clsm->iface.session;
txn = &session->txn;
/* Merge cursors never update. */
if (F_ISSET(clsm, WT_CLSM_MERGE))
return (0);
if (reset) {
WT_ASSERT(session, !F_ISSET(&clsm->iface,
WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
WT_RET(__clsm_reset_cursors(clsm, NULL));
}
for (;;) {
/*
* If the cursor looks up-to-date, check if the cache is full.
* In case this call blocks, the check will be repeated before
* proceeding.
*/
if (clsm->dsk_gen != lsm_tree->dsk_gen &&
lsm_tree->nchunks != 0)
goto open;
if (clsm->dsk_gen != lsm_tree->dsk_gen &&
lsm_tree->nchunks != 0)
goto open;
/* Update the maximum transaction ID in the primary chunk. */
if (update) {
/*
* Ensure that there is a transaction snapshot active.
*/
WT_RET(__wt_txn_autocommit_check(session));
WT_RET(__wt_txn_id_check(session));
WT_RET(__clsm_enter_update(clsm));
if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
goto open;
if (txn->isolation == WT_ISO_SNAPSHOT)
__wt_txn_cursor_op(session);
/*
* Figure out how many updates are required for
* snapshot isolation.
*
* This is not a normal visibility check on the maximum
* transaction ID in each chunk: any transaction ID
* that overlaps with our snapshot is a potential
* conflict.
*/
clsm->nupdates = 1;
if (txn->isolation == WT_ISO_SNAPSHOT &&
F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
WT_ASSERT(session,
F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
snap_min = txn->snap_min;
for (switch_txnp =
&clsm->switch_txn[clsm->nchunks - 2];
clsm->nupdates < clsm->nchunks;
clsm->nupdates++, switch_txnp--) {
if (WT_TXNID_LT(*switch_txnp, snap_min))
break;
WT_ASSERT(session,
!__wt_txn_visible_all(
session, *switch_txnp));
}
}
}
/*
* Stop when we are up-to-date, as long as this is:
* - a snapshot isolation update and the cursor is set up for
* that;
* - an update operation with a primary chunk, or
* - a read operation and the cursor is open for reading.
*/
if ((!update ||
txn->isolation != WT_ISO_SNAPSHOT ||
F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
((update && clsm->primary_chunk != NULL) ||
(!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
break;
open: WT_WITH_SCHEMA_LOCK(session,
ret = __clsm_open_cursors(clsm, update, 0, 0));
WT_RET(ret);
}
if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
WT_RET(__cursor_enter(session));
F_SET(clsm, WT_CLSM_ACTIVE);
}
return (0);
}
DB *
__rec_open(const char *fname, int flags, mode_t mode, const RECNOINFO *openinfo,
int dflags)
{
BTREE *t;
BTREEINFO btopeninfo;
DB *dbp;
PAGE *h;
struct stat sb;
int rfd = -1; /* pacify gcc */
int sverrno;
dbp = NULL;
/* Open the user's file -- if this fails, we're done. */
if (fname != NULL) {
if ((rfd = open(fname, flags | O_CLOEXEC, mode)) == -1)
return NULL;
}
/* Create a btree in memory (backed by disk). */
if (openinfo) {
if (openinfo->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT))
goto einval;
btopeninfo.flags = 0;
btopeninfo.cachesize = openinfo->cachesize;
btopeninfo.maxkeypage = 0;
btopeninfo.minkeypage = 0;
btopeninfo.psize = openinfo->psize;
btopeninfo.compare = NULL;
btopeninfo.prefix = NULL;
btopeninfo.lorder = openinfo->lorder;
dbp = __bt_open(openinfo->bfname,
O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo, dflags);
} else
dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL, dflags);
if (dbp == NULL)
goto err;
/*
* Some fields in the tree structure are recno specific. Fill them
* in and make the btree structure look like a recno structure. We
* don't change the bt_ovflsize value, it's close enough and slightly
* bigger.
*/
t = dbp->internal;
if (openinfo) {
if (openinfo->flags & R_FIXEDLEN) {
F_SET(t, R_FIXLEN);
t->bt_reclen = openinfo->reclen;
if (t->bt_reclen == 0)
goto einval;
}
t->bt_bval = openinfo->bval;
} else
t->bt_bval = '\n';
F_SET(t, R_RECNO);
if (fname == NULL)
F_SET(t, R_EOF | R_INMEM);
else
t->bt_rfd = rfd;
if (fname != NULL) {
/*
* In 4.4BSD, stat(2) returns true for ISSOCK on pipes.
* Unfortunately, that's not portable, so we use lseek
* and check the errno values.
*/
errno = 0;
if (lseek(rfd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE) {
switch (flags & O_ACCMODE) {
case O_RDONLY:
F_SET(t, R_RDONLY);
break;
default:
goto einval;
}
slow: if ((t->bt_rfp = fdopen(rfd, "r")) == NULL)
goto err;
F_SET(t, R_CLOSEFP);
t->bt_irec =
F_ISSET(t, R_FIXLEN) ? __rec_fpipe : __rec_vpipe;
} else {
switch (flags & O_ACCMODE) {
case O_RDONLY:
F_SET(t, R_RDONLY);
break;
case O_RDWR:
break;
default:
goto einval;
}
if (fstat(rfd, &sb))
goto err;
/*
* Kluge -- we'd like to test to see if the file is too
* big to mmap. Since, we don't know what size or type
* off_t's or size_t's are, what the largest unsigned
* integral type is, or what random insanity the local
//.........这里部分代码省略.........
//.........这里部分代码省略.........
if (F_ISSET(clp, CL_STDIN_TTY)) {
(void)tcgetattr(STDIN_FILENO, &t);
(void)tcsetattr(STDIN_FILENO,
TCSASOFT | TCSADRAIN, &clp->orig);
}
/* Stop the process group. */
(void)kill(0, SIGTSTP);
/* Time passes ... */
/* Restore terminal settings. */
if (F_ISSET(clp, CL_STDIN_TTY))
(void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &t);
return (0);
}
/*
* Move to the lower left-hand corner of the screen.
*
* XXX
* Not sure this is necessary in System V implementations, but it
* shouldn't hurt.
*/
getyx(win, y, x);
(void)wmove(win, LINES - 1, 0);
(void)wrefresh(win);
/*
* Temporarily end the screen. System V introduced a semantic where
* endwin() could be restarted. We use it because restarting curses
* from scratch often fails in System V. 4BSD curses didn't support
* restarting after endwin(), so we have to do what clean up we can
* without calling it.
*/
/* Save the terminal settings. */
(void)tcgetattr(STDIN_FILENO, &t);
/* Restore the cursor keys to normal mode. */
(void)keypad(stdscr, FALSE);
/* Restore the window name. */
(void)cl_rename(sp, NULL, 0);
#ifdef HAVE_BSD_CURSES
(void)cl_attr(sp, SA_ALTERNATE, 0);
#else
(void)endwin();
#endif
/*
* XXX
* Restore the original terminal settings. This is bad -- the
* reset can cause character loss from the tty queue. However,
* we can't call endwin() in BSD curses implementations, and too
* many System V curses implementations don't get it right.
*/
(void)tcsetattr(STDIN_FILENO, TCSADRAIN | TCSASOFT, &clp->orig);
/* Stop the process group. */
(void)kill(0, SIGTSTP);
/* Time passes ... */
/*
* If we received a killer signal, we're done. Leave everything
* unchanged. In addition, the terminal has already been reset
* correctly, so leave it alone.
*/
if (clp->killersig) {
F_CLR(clp, CL_SCR_EX_INIT | CL_SCR_VI_INIT);
return (0);
}
/* Restore terminal settings. */
wrefresh(win); /* Needed on SunOs/Solaris ? */
if (F_ISSET(clp, CL_STDIN_TTY))
(void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &t);
#ifdef HAVE_BSD_CURSES
(void)cl_attr(sp, SA_ALTERNATE, 1);
#endif
/* Set the window name. */
(void)cl_rename(sp, sp->frp->name, 1);
/* Put the cursor keys into application mode. */
(void)keypad(stdscr, TRUE);
/* Refresh and repaint the screen. */
(void)wmove(win, y, x);
(void)cl_refresh(sp, 1);
/* If the screen changed size, set the SIGWINCH bit. */
if (cl_ssize(sp, 1, NULL, NULL, &changed))
return (1);
if (changed)
F_SET(CLP(sp), CL_SIGWINCH);
return (0);
}
/* Source: project nvi (developer ID: fishman), file cl_funcs.c, 101 lines of code. */
/* Example 11: __clsm_open_cursors */
/*
* __clsm_open_cursors --
* Open cursors for the current set of files.
*/
static int
__clsm_open_cursors(
WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
{
WT_BTREE *btree;
WT_CURSOR *c, **cp, *primary;
WT_DECL_RET;
WT_LSM_CHUNK *chunk;
WT_LSM_TREE *lsm_tree;
WT_SESSION_IMPL *session;
WT_TXN *txn;
const char *checkpoint, *ckpt_cfg[3];
uint64_t saved_gen;
u_int i, nchunks, ngood, nupdates;
u_int close_range_end, close_range_start;
bool locked;
c = &clsm->iface;
session = (WT_SESSION_IMPL *)c->session;
txn = &session->txn;
chunk = NULL;
locked = false;
lsm_tree = clsm->lsm_tree;
/*
* Ensure that any snapshot update has cursors on the right set of
* chunks to guarantee visibility is correct.
*/
if (update && txn->isolation == WT_ISO_SNAPSHOT)
F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
/*
* Query operations need a full set of cursors. Overwrite cursors
* do queries in service of updates.
*/
if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
F_SET(clsm, WT_CLSM_OPEN_READ);
if (lsm_tree->nchunks == 0)
return (0);
ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
ckpt_cfg[2] = NULL;
/*
* If the key is pointing to memory that is pinned by a chunk
* cursor, take a copy before closing cursors.
*/
if (F_ISSET(c, WT_CURSTD_KEY_INT))
WT_CURSOR_NEEDKEY(c);
F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
locked = true;
/* Merge cursors have already figured out how many chunks they need. */
retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) {
nchunks = clsm->nchunks;
ngood = 0;
/*
* We may have raced with another merge completing. Check that
* we're starting at the right offset in the chunk array.
*/
if (start_chunk >= lsm_tree->nchunks ||
lsm_tree->chunk[start_chunk]->id != start_id) {
for (start_chunk = 0;
start_chunk < lsm_tree->nchunks;
start_chunk++) {
chunk = lsm_tree->chunk[start_chunk];
if (chunk->id == start_id)
break;
}
/* We have to find the start chunk: merge locked it. */
WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
}
WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
} else {
nchunks = lsm_tree->nchunks;
/*
* If we are only opening the cursor for updates, only open the
* primary chunk, plus any other chunks that might be required
* to detect snapshot isolation conflicts.
*/
if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
WT_ERR(__wt_realloc_def(session,
&clsm->txnid_alloc, nchunks,
&clsm->switch_txn));
if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
ngood = nupdates = 0;
else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
/*
//.........这里部分代码省略.........
/*
* __wt_evict_file --
* Discard pages for a specific file.
*/
int
__wt_evict_file(WT_SESSION_IMPL *session, int syncop)
{
WT_DECL_RET;
WT_PAGE *page;
WT_REF *next_ref, *ref;
int evict_reset;
/*
* We need exclusive access to the file -- disable ordinary eviction
* and drain any blocks already queued.
*/
WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));
/* Make sure the oldest transaction ID is up-to-date. */
__wt_txn_update_oldest(session, 1);
/* Walk the tree, discarding pages. */
next_ref = NULL;
WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
WT_READ_CACHE | WT_READ_NO_EVICT));
while ((ref = next_ref) != NULL) {
page = ref->page;
/*
* Eviction can fail when a page in the evicted page's subtree
* switches state. For example, if we don't evict a page marked
* empty, because we expect it to be merged into its parent, it
* might no longer be empty after it's reconciled, in which case
* eviction of its parent would fail. We can either walk the
* tree multiple times (until it's finally empty), or reconcile
* each page to get it to its final state before considering if
* it's an eviction target or will be merged into its parent.
*
* Don't limit this test to any particular page type, that tends
* to introduce bugs when the reconciliation of other page types
* changes, and there's no advantage to doing so.
*
* Eviction can also fail because an update cannot be written.
* If sessions have disjoint sets of files open, updates in a
* no-longer-referenced file may not yet be globally visible,
* and the write will fail with EBUSY. Our caller handles that
* error, retrying later.
*/
if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));
/*
* We can't evict the page just returned to us (it marks our
* place in the tree), so move the walk to one page ahead of
* the page being evicted. Note, we reconciled the returned
* page first: if reconciliation of that page were to change
* the shape of the tree, and we did the next walk call before
* the reconciliation, the next walk call could miss a page in
* the tree.
*/
WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
WT_READ_CACHE | WT_READ_NO_EVICT));
switch (syncop) {
case WT_SYNC_CLOSE:
/*
* Evict the page.
*/
WT_ERR(__wt_evict(session, ref, WT_EVICT_EXCLUSIVE));
break;
case WT_SYNC_DISCARD:
WT_ASSERT(session,
__wt_page_can_evict(session, page, 0, NULL));
__wt_evict_page_clean_update(session, ref);
break;
case WT_SYNC_DISCARD_FORCE:
/*
* Forced discard of the page, whether clean or dirty.
* If we see a dirty page in a forced discard, clean
* the page, both to keep statistics correct, and to
* let the page-discard function assert no dirty page
* is ever discarded.
*/
if (__wt_page_is_modified(page)) {
page->modify->write_gen = 0;
__wt_cache_dirty_decr(session, page);
}
F_SET(session, WT_SESSION_DISCARD_FORCE);
__wt_evict_page_clean_update(session, ref);
F_CLR(session, WT_SESSION_DISCARD_FORCE);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
}
if (0) {
err: /* On error, clear any left-over tree walk. */
if (next_ref != NULL)
WT_TRET(__wt_page_release(
//.........这里部分代码省略.........
/*
 * __wt_btcur_prev --
 * Move to the previous record in the tree.
 *
 * cbt is the btree cursor to move. A non-zero truncating adds
 * WT_READ_TRUNCATE to the flags passed to the tree walk.
 *
 * Returns 0 once a per-page step function succeeds, WT_NOTFOUND when the
 * tree walk hands back no further page, or another error code. On any
 * non-zero return after cursor initialization the cursor is reset.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
{
WT_DECL_RET;
WT_PAGE *page;
WT_SESSION_IMPL *session;
uint32_t flags;
int skipped, newpage;
session = (WT_SESSION_IMPL *)cbt->iface.session;
/* Count the call in both the connection and data-source statistics. */
WT_STAT_FAST_CONN_INCR(session, cursor_prev);
WT_STAT_FAST_DATA_INCR(session, cursor_prev);
flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */
if (truncating)
LF_SET(WT_READ_TRUNCATE);
/* A failure here returns directly, without going through the reset. */
WT_RET(__cursor_func_init(cbt, 0));
/*
 * If we aren't already iterating in the right direction, there's
 * some setup to do.
 */
if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
__wt_btcur_iterate_setup(cbt, 0);
/*
 * Walk any page we're holding until the underlying call returns not-
 * found. Then, move to the previous page, until we reach the start
 * of the file.
 *
 * newpage is 0 only for the first pass, over whatever page the cursor
 * already references; skipped is cleared at the start of every pass and
 * is written by the variable-length column-store and row-store step
 * functions.
 */
for (skipped = newpage = 0;; skipped = 0, newpage = 1) {
page = cbt->ref == NULL ? NULL : cbt->ref->page;
WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));
/*
 * The last page in a column-store has appended entries.
 * We handle it separately from the usual cursor code:
 * it's only that one page and it's in a simple format.
 */
if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
(cbt->ins_head = WT_COL_APPEND(page)) != NULL)
F_SET(cbt, WT_CBT_ITERATE_APPEND);
if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
switch (page->type) {
case WT_PAGE_COL_FIX:
ret = __cursor_fix_append_prev(cbt, newpage);
break;
case WT_PAGE_COL_VAR:
ret = __cursor_var_append_prev(
cbt, newpage, &skipped);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
/* Success in the append list ends the search. */
if (ret == 0)
break;
/*
 * The append list did not produce a record: leave append mode. A
 * real error ends the loop; not-found falls through to the page's
 * regular entries, which are then treated as a newly entered page.
 */
F_CLR(cbt, WT_CBT_ITERATE_APPEND);
if (ret != WT_NOTFOUND)
break;
newpage = 1;
}
if (page != NULL) {
/* Step backwards through the page's regular entries. */
switch (page->type) {
case WT_PAGE_COL_FIX:
ret = __cursor_fix_prev(cbt, newpage);
break;
case WT_PAGE_COL_VAR:
ret = __cursor_var_prev(cbt, newpage, &skipped);
break;
case WT_PAGE_ROW_LEAF:
ret = __cursor_row_prev(cbt, newpage, &skipped);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
/* Anything except not-found (success or error) ends the loop. */
if (ret != WT_NOTFOUND)
break;
}
/*
 * The page was entered fresh on this pass, is now exhausted, and a
 * step function set skipped: give the page the oldest read generation.
 * NOTE(review): presumably this makes the page an early eviction
 * candidate once it is released -- confirm against the eviction code.
 */
if (newpage && skipped)
page->read_gen = WT_READGEN_OLDEST;
/* Move to the previous page; no page left means not-found. */
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
}
/* Any failure, including not-found, resets the cursor before returning. */
err: if (ret != 0)
WT_TRET(__cursor_reset(cbt));
return (ret);
}
//.........这里部分代码省略.........
db_append(sp, 1, lno++, NULL, 0))
return (1);
}
}
/*
* If there's any remaining text, we're in a global, and
* there's more command to parse.
*
* !!!
* We depend on the fact that non-global commands will eat the
* rest of the command line as text input, and before getting
* any text input from the user. Otherwise, we'd have to save
* off the command text before or during the call to the text
* input function below.
*/
if (len != 0)
cmdp->save_cmd = t;
cmdp->save_cmdlen = len;
}
if (F_ISSET(sp, SC_EX_GLOBAL)) {
if ((sp->lno = lno) == 0 && db_exist(sp, 1))
sp->lno = 1;
return (0);
}
/*
* If not in a global command, read from the terminal.
*
* If this code is called by vi, we want to reset the terminal and use
* ex's line get routine. It actually works fine if we use vi's get
* routine, but it doesn't look as nice. Maybe if we had a separate
* window or something, but getting a line at a time looks awkward.
* However, depending on the screen that we're using, that may not
* be possible.
*/
if (F_ISSET(sp, SC_VI)) {
if (gp->scr_screen(sp, SC_EX)) {
ex_wemsg(sp, cmdp->cmd->name, EXM_NOCANON);
return (1);
}
/* If we're still in the vi screen, move out explicitly. */
need_newline = !F_ISSET(sp, SC_SCR_EXWROTE);
F_SET(sp, SC_SCR_EX | SC_SCR_EXWROTE);
if (need_newline)
(void)ex_puts(sp, "\n");
/*
* !!!
* Users of historical versions of vi sometimes get confused
* when they enter append mode, and can't seem to get out of
* it. Give them an informational message.
*/
(void)ex_puts(sp,
msg_cat(sp, "273|Entering ex input mode.", NULL));
(void)ex_puts(sp, "\n");
(void)ex_fflush(sp);
}
/*
* Set input flags; the ! flag turns off autoindent for append,
* change and insert.
*/
LF_INIT(TXT_DOTTERM | TXT_NUMBER);
if (!FL_ISSET(cmdp->iflags, E_C_FORCE) && O_ISSET(sp, O_AUTOINDENT))
LF_SET(TXT_AUTOINDENT);
if (O_ISSET(sp, O_BEAUTIFY))
LF_SET(TXT_BEAUTIFY);
/*
* This code can't use the common screen TEXTH structure (sp->tiq),
* as it may already be in use, e.g. ":append|s/abc/ABC/" would fail
* as we are only halfway through the text when the append code fires.
* Use a local structure instead. (The ex code would have to use a
* local structure except that we're guaranteed to finish remaining
* characters in the common TEXTH structure when they were inserted
* into the file, above.)
*/
TAILQ_INIT(tiq);
if (ex_txt(sp, tiq, 0, flags))
return (1);
TAILQ_FOREACH(tp, tiq, q) {
if (db_append(sp, 1, lno++, tp->lb, tp->len))
return (1);
++cnt;
}
/*
* Set sp->lno to the final line number value (correcting for a
* possible 0 value) as that's historically correct for the final
* line value, whether or not the user entered any text.
*/
if ((sp->lno = lno) == 0 && db_exist(sp, 1))
sp->lno = 1;
return (0);
}
/* Please leave a comment. */