PostgreSQL 在何处处理 SQL 查询（之十七）

继续：

/*
 * estimate_rel_size - estimate # pages and # tuples in a table or index
 *
 * We also estimate the fraction of the pages that are marked all-visible in
 * the visibility map, for use in estimation of index-only scans.
 *
 * If attr_widths isn't NULL, it points to the zero-index entry of the
 * relation's attr_widths[] cache; we fill this in if we have need to compute
 * the attribute widths for estimation purposes.
 *
 * The function itself returns void.  NOTE(review): the pages, tuples and
 * allvisfrac pointer parameters are presumably where the estimates are
 * stored, but the stores are in code elided ("...") from this excerpt --
 * confirm against the full source.
 */
void
estimate_rel_size(Relation rel, int32 *attr_widths,
                  BlockNumber *pages, double *tuples, double *allvisfrac)
{
    /* (local declarations are elided in this excerpt) */
    ...
    /* Choose the estimation strategy from the relation's kind. */
    switch (rel->rd_rel->relkind)
    {
        case RELKIND_RELATION:
        case RELKIND_INDEX:
        case RELKIND_TOASTVALUE:
            /* it has storage, ok to call the smgr */
            /*
             * Ask for the current block count of the relation; the macro
             * RelationGetNumberOfBlocks is the entry point this walkthrough
             * follows down into the storage manager.
             */
            curpages = RelationGetNumberOfBlocks(rel);
            /* (remainder of the estimate for these kinds is elided) */
            ...
            break;
        case RELKIND_SEQUENCE:
            /* (sequence handling is elided in this excerpt) */
            ...
            break;
        case RELKIND_FOREIGN_TABLE:
            /* (foreign-table handling is elided in this excerpt) */
            ...
            break;
        default:
            /* (handling of all other relation kinds is elided) */
            ...
            break;
    }
}

首先要确定此表有多少个块，这由 RelationGetNumberOfBlocks 完成。先看它会用到的 ForkNumber 类型：

/*
 * ForkNumber identifies one fork of a relation's physical storage.
 *
 * A relation is stored as one or more forks.  The main fork always exists;
 * further forks may be present to hold various kinds of metadata.  Code that
 * needs to address a particular fork of a relation passes one of these
 * values.
 */
typedef enum ForkNumber
{
    InvalidForkNumber = -1,     /* sentinel: not a real fork */
    MAIN_FORKNUM = 0,           /* the main fork, always created */
    FSM_FORKNUM = 1,
    VISIBILITYMAP_FORKNUM = 2,
    INIT_FORKNUM = 3

    /*
     * NOTE: when adding a new fork, remember to change MAX_FORKNUM below
     * and to update the forkNames array in catalog.c.
     */
} ForkNumber;

再看：

/*
 * RelationGetNumberOfBlocks
 *        Shorthand for asking RelationGetNumberOfBlocksInFork about the
 *        main fork (MAIN_FORKNUM) of the given relation.
 */
#define RelationGetNumberOfBlocks(reln) \
    RelationGetNumberOfBlocksInFork((reln), MAIN_FORKNUM)

再看：

/*
 * RelationGetNumberOfBlocksInFork
 *        Report the current number of pages in one fork of a relation.
 *
 * relation: the relation to inspect.
 * forkNum:  which fork of the relation to measure.
 *
 * Returns whatever block count smgrnblocks() reports for that fork.
 */
BlockNumber
RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
{
    BlockNumber nblocks;

    /* Make sure the relation is open at the smgr level before using rd_smgr */
    RelationOpenSmgr(relation);

    nblocks = smgrnblocks(relation->rd_smgr, forkNum);

    return nblocks;
}

再看：

当数据库表对应的数据文件出现问题时，调用 smgrnblocks 函数就会报错：

/*
 *    smgrnblocks() -- Report how many blocks the given fork of the
 *                     supplied relation contains.
 *
 * The work is delegated to the smgr_nblocks callback of the storage
 * manager selected by reln->smgr_which in the smgrsw[] dispatch table.
 */
BlockNumber
smgrnblocks(SMgrRelation reln, ForkNumber forknum)
{
    BlockNumber result;

    /* Calling through the function pointer directly is the same as (*fp)() */
    result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);

    return result;
}

此处使用了函数指针。经过一番跟踪，发现当我第一次执行形如 select * from tab01 的命令时，会执行到 mdnblocks：

/*
 *    mdnblocks() -- Count the blocks stored in one fork of a relation.
 *
 *        Side effect worth knowing about: every active segment of the
 *        relation ends up opened and linked into the mdfd_chain list.
 *        Without a call to this routine, the chain only contains segments
 *        up to the last one that was actually touched.
 *
 *        Returns (number of full segments * RELSEG_SIZE) plus the block
 *        count of the final, partially filled segment.  Reports FATAL if a
 *        segment is larger than RELSEG_SIZE, and ERROR if a follow-on
 *        segment cannot be opened.
 */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
    const BlockNumber seg_capacity = (BlockNumber) RELSEG_SIZE;
    MdfdVec    *seg = mdopen(reln, forknum, EXTENSION_FAIL);
    BlockNumber seg_index;
    BlockNumber seg_blocks;

    /*
     * Walk straight to the last segment already in the chain, so we don't
     * issue redundant seeks on the earlier ones.  Those were previously
     * verified to be exactly RELSEG_SIZE long, and rechecking every time
     * would be pointless.
     *
     * NOTE: that assumption can only break if another backend truncated
     * the relation.  Higher code levels are relied upon to deal with that
     * by closing and re-opening the md fd, which happens via relcache
     * flush.  (The checkpointer doesn't take part in relcache flush, so it
     * may hold chain entries for inactive segments; that is harmless since
     * the checkpointer never needs to compute relation size.)
     */
    for (seg_index = 0; seg->mdfd_chain != NULL; seg_index++)
        seg = seg->mdfd_chain;

    while (1)
    {
        seg_blocks = _mdnblocks(reln, forknum, seg);

        if (seg_blocks > seg_capacity)
            elog(FATAL, "segment too big");

        /* A short segment is the last one: the total can now be computed */
        if (seg_blocks < seg_capacity)
            break;

        /*
         * The segment is exactly RELSEG_SIZE long, so move on to the next.
         */
        seg_index++;

        if (seg->mdfd_chain == NULL)
        {
            /*
             * O_CREAT is passed, so when the last segment is exactly
             * RELSEG_SIZE long the following segment is created right away
             * (with zero length).  Perhaps not strictly necessary, but it
             * keeps the logic simple.
             */
            MdfdVec    *next = _mdfd_openseg(reln, forknum, seg_index, O_CREAT);

            if (next == NULL)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not open file \"%s\": %m",
                                _mdfd_segpath(reln, forknum, seg_index))));

            seg->mdfd_chain = next;
        }

        seg = seg->mdfd_chain;
    }

    return (seg_index * seg_capacity) + seg_blocks;
}

下一步看 mdopen 函数：

/*
 *    mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 *
 * reln:     relation whose file is to be opened.
 * forknum:  fork of the relation to open.
 * behavior: with EXTENSION_RETURN_NULL, a file that looks deleted
 *           (FILE_POSSIBLY_DELETED(errno)) yields NULL; in every other
 *           failure case an ERROR is reported.
 *
 * Parts of the function are elided ("...") in this excerpt.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
{
    ...

    path = relpath(reln->smgr_rnode, forknum);

    fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

    if (fd < 0)
    {
        /*
         * The debug traces below go through stdio, which is allowed to
         * modify errno.  errno is consulted further down (both by
         * FILE_POSSIBLY_DELETED and by %m), so remember the value left by
         * the failed open and put it back after each trace.
         */
        int         save_errno = errno;

        fprintf(stderr,"In %s----%d\n",__FUNCTION__, __LINE__);
        errno = save_errno;

        /*
         * During bootstrap, there are cases where a system relation will be
         * accessed (by internal backend processes) before the bootstrap
         * script nominally creates it.  Therefore, accept mdopen() as a
         * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
         */
        if (IsBootstrapProcessingMode())
        {
            fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
            /* From here on, the errno of interest is the retry's */
            save_errno = errno;
        }

        fprintf(stderr,"In %s----%d\n",__FUNCTION__, __LINE__);
        errno = save_errno;

        if (fd < 0)
        {
            if (behavior == EXTENSION_RETURN_NULL &&
                FILE_POSSIBLY_DELETED(errno))
            {
                pfree(path);
                return NULL;
            }
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\": %m", path)));
        }
    }

    ...
    return mdfd;
}

再看 PathNameOpenFile：如果打开文件失败，它会返回 -1。

/*
 * PathNameOpenFile -- open a file in an arbitrary directory
 *
 * NB: if the passed pathname is relative (which it usually is),
 * it will be interpreted relative to the process' working directory
 * (which should always be $PGDATA when this code is running).
 *
 * fileName:  path of the file to open.
 * fileFlags: open flags, handed to BasicOpenFile as given.
 * fileMode:  permission bits, handed to BasicOpenFile as given.
 *
 * On success, returns the File obtained from AllocateVfd(), which indexes
 * the VfdCache entry set up for the file.  If BasicOpenFile() fails, the
 * entry is released and -1 is returned; errno is restored to the value it
 * had right after the failed open, so callers can test it or report it
 * with %m.  Reports ERROR if the file name cannot be copied.
 */
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
    char       *fnamecopy;
    File        file;
    Vfd           *vfdP;

    DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
               fileName, fileFlags, fileMode));

    /*
     * We need a malloc'd copy of the file name; fail cleanly if no room.
     */
    fnamecopy = strdup(fileName);
    if (fnamecopy == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));

    file = AllocateVfd();
    vfdP = &VfdCache[file];

    /*
     * While the count of open descriptors is at or above max_safe_fds,
     * keep releasing files; stop as soon as ReleaseLruFile() reports that
     * it could not release one.
     */
    while (nfile + numAllocatedDescs >= max_safe_fds)
    {
        if (!ReleaseLruFile())
            break;
    }

    vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);

    if (vfdP->fd < 0)
    {
        /*
         * The cleanup calls below may change errno, but callers inspect
         * errno after seeing the -1 result.  Preserve the value from the
         * failed open across the cleanup.
         */
        int         save_errno = errno;

        FreeVfd(file);
        free(fnamecopy);
        errno = save_errno;
        return -1;
    }
    ++nfile;
    DO_DB(elog(LOG, "PathNameOpenFile: success %d",
               vfdP->fd));

    Insert(file);

    /* The VFD entry takes ownership of the malloc'd name copy */
    vfdP->fileName = fnamecopy;
    /* Saved flags are adjusted to be OK for re-opening file */
    vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    vfdP->fileMode = fileMode;
    vfdP->seekPos = 0;
    vfdP->fileSize = 0;
    vfdP->fdstate = 0x0;
    vfdP->resowner = NULL;

    return file;
}