PostgreSQL在何处处理 sql查询之三十七

看：

TupleTableSlot *
ExecProcNode(PlanState *node)
{
    TupleTableSlot *result;

    CHECK_FOR_INTERRUPTS();

    if (node->chgParam != NULL) /* something changed */
        ExecReScan(node);        /* let ReScan handle this */

    if (node->instrument)
        InstrStartNode(node->instrument);

    switch (nodeTag(node))
    {
            /*
             * control nodes
             */
        case T_ResultState:
            fprintf(stderr,"T_ResultState\n");
            result = ExecResult((ResultState *) node);
            break;

        case T_ModifyTableState:
            fprintf(stderr,"T_ModifyTableState\n");
            result = ExecModifyTable((ModifyTableState *) node);
            break;

        case T_AppendState:
            fprintf(stderr,"T_AppendState\n");
            result = ExecAppend((AppendState *) node);
            break;

        case T_MergeAppendState:
            fprintf(stderr,"T_MergeAppendState\n");
            result = ExecMergeAppend((MergeAppendState *) node);
            break;

        case T_RecursiveUnionState:
            fprintf(stderr,"T_RecursiveUnionState\n");
            result = ExecRecursiveUnion((RecursiveUnionState *) node);
            break;

            /* BitmapAndState does not yield tuples */

            /* BitmapOrState does not yield tuples */

            /*
             * scan nodes
             */
        case T_SeqScanState:
            fprintf(stderr,"T_SeqScanState\n");
            result = ExecSeqScan((SeqScanState *) node);
            break;

        case T_IndexScanState:
            fprintf(stderr,"T_IndexScanState\n");
            result = ExecIndexScan((IndexScanState *) node);
            break;

        case T_IndexOnlyScanState:
            fprintf(stderr,"T_IndexOnlyScanState\n");
            result = ExecIndexOnlyScan((IndexOnlyScanState *) node);
            break;

            /* BitmapIndexScanState does not yield tuples */

        case T_BitmapHeapScanState:
            fprintf(stderr,"T_BitmapHeapScanState\n");
            result = ExecBitmapHeapScan((BitmapHeapScanState *) node);
            break;

        case T_TidScanState:
            fprintf(stderr,"T_TidScanState\n");
            result = ExecTidScan((TidScanState *) node);
            break;

        case T_SubqueryScanState:
            fprintf(stderr,"T_SubqueryScanState\n");
            result = ExecSubqueryScan((SubqueryScanState *) node);
            break;

        case T_FunctionScanState:
            fprintf(stderr,"T_FunctionScanState\n");
            result = ExecFunctionScan((FunctionScanState *) node);
            break;

        case T_ValuesScanState:
            fprintf(stderr,"T_ValuesScanState\n");
            result = ExecValuesScan((ValuesScanState *) node);
            break;

        case T_CteScanState:
            fprintf(stderr,"T_CteScanState\n");
            result = ExecCteScan((CteScanState *) node);
            break;

        case T_WorkTableScanState:
            fprintf(stderr,"T_WorkTableScanState\n");
            result = ExecWorkTableScan((WorkTableScanState *) node);
            break;

        case T_ForeignScanState:
            fprintf(stderr,"T_ForeignScanState\n");
            result = ExecForeignScan((ForeignScanState *) node);
            break;

            /*
             * join nodes
             */
        case T_NestLoopState:
            fprintf(stderr,"T_NestLoopState\n");
            result = ExecNestLoop((NestLoopState *) node);
            break;

        case T_MergeJoinState:
            fprintf(stderr,"T_MergeJoinState\n");
            result = ExecMergeJoin((MergeJoinState *) node);
            break;

        case T_HashJoinState:
            fprintf(stderr,"T_HashJoinState\n");
            result = ExecHashJoin((HashJoinState *) node);
            break;

            /*
             * materialization nodes
             */
        case T_MaterialState:
            fprintf(stderr,"T_MaterialState\n");
            result = ExecMaterial((MaterialState *) node);
            break;

        case T_SortState:
            fprintf(stderr,"T_SortState\n");
            result = ExecSort((SortState *) node);
            break;

        case T_GroupState:
            fprintf(stderr,"T_GroupState\n");
            result = ExecGroup((GroupState *) node);
            break;

        case T_AggState:
            fprintf(stderr,"T_AggState\n");
            result = ExecAgg((AggState *) node);
            break;

        case T_WindowAggState:
            fprintf(stderr,"T_WindowAggState\n");
            result = ExecWindowAgg((WindowAggState *) node);
            break;

        case T_UniqueState:
            fprintf(stderr,"T_UniqueState\n");
            result = ExecUnique((UniqueState *) node);
            break;

        case T_HashState:
            fprintf(stderr,"T_HashState\n");
            result = ExecHash((HashState *) node);
            break;

        case T_SetOpState:
            fprintf(stderr,"T_SetOpState\n");
            result = ExecSetOp((SetOpState *) node);
            break;

        case T_LockRowsState:
            fprintf(stderr,"T_LockRowsState\n");
            result = ExecLockRows((LockRowsState *) node);
            break;

        case T_LimitState:
            fprintf(stderr,"T_LimitState\n");
            result = ExecLimit((LimitState *) node);
            break;

        default:
            elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
            result = NULL;
            break;
    }

    if (node->instrument)
        InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);

    return result;
}

实际执行发现：我的select id from tst04 where id<3 ，执行了三次 T_SeqScanState。

将上述代码缩略：

TupleTableSlot *
ExecProcNode(PlanState *node)
{
    TupleTableSlot *result;

    CHECK_FOR_INTERRUPTS();

    if (node->chgParam != NULL) /* something changed */
        ExecReScan(node);        /* let ReScan handle this */

    if (node->instrument)
        InstrStartNode(node->instrument);

    switch (nodeTag(node))
    {
       ...
            /*
             * scan nodes
             */
        case T_SeqScanState:
            fprintf(stderr,"T_SeqScanState\n");
            result = ExecSeqScan((SeqScanState *) node);
            break;

        ...
    }

    if (node->instrument)
        InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);

    return result;
}

再看 ExecSeaScan 的执行：

/* ----------------------------------------------------------------
 *        ExecSeqScan(node)
 *
 *        Scans the relation sequentially and returns the next qualifying
 *        tuple.
 *        We call the ExecScan() routine and pass it the appropriate
 *        access method functions.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecSeqScan(SeqScanState *node)
{
    return ExecScan((ScanState *) node,
                    (ExecScanAccessMtd) SeqNext,
                    (ExecScanRecheckMtd) SeqRecheck);
}

这个就是全表扫描中的一步。

/* ----------------------------------------------------------------
 *        ExecScan
 *
 *        Scans the relation using the 'access method' indicated and
 *        returns the next qualifying tuple in the direction specified
 *        in the global variable ExecDirection.
 *        The access method returns the next tuple and execScan() is
 *        responsible for checking the tuple returned against the qual-clause.
 *
 *        A 'recheck method' must also be provided that can check an
 *        arbitrary tuple of the relation against any qual conditions
 *        that are implemented internal to the access method.
 *
 *        Conditions:
 *          -- the "cursor" maintained by the AMI is positioned at the tuple
 *             returned previously.
 *
 *        Initial States:
 *          -- the relation indicated is opened for scanning so that the
 *             "cursor" is positioned before the first qualifying tuple.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecScan(ScanState *node,
         ExecScanAccessMtd accessMtd,    /* function returning a tuple */
         ExecScanRecheckMtd recheckMtd)
{
    ExprContext *econtext;
    List       *qual;
    ProjectionInfo *projInfo;
    ExprDoneCond isDone;
    TupleTableSlot *resultSlot;

    /*
     * Fetch data from node
     */
    qual = node->ps.qual;
    projInfo = node->ps.ps_ProjInfo;
    econtext = node->ps.ps_ExprContext;

    /*
     * If we have neither a qual to check nor a projection to do, just skip
     * all the overhead and return the raw scan tuple.
     */
    if (!qual && !projInfo)
    {
        ResetExprContext(econtext);
        return ExecScanFetch(node, accessMtd, recheckMtd);
    }

    /*
     * Check to see if we're still projecting out tuples from a previous scan
     * tuple (because there is a function-returning-set in the projection
     * expressions).  If so, try to project another one.
     */
    if (node->ps.ps_TupFromTlist)
    {
        Assert(projInfo);        /* can't get here if not projecting */
        resultSlot = ExecProject(projInfo, &isDone);
        if (isDone == ExprMultipleResult)
            return resultSlot;
        /* Done with that source tuple... */
        node->ps.ps_TupFromTlist = false;
    }

    /*
     * Reset per-tuple memory context to free any expression evaluation
     * storage allocated in the previous tuple cycle.  Note this can't happen
     * until we're done projecting out tuples from a scan tuple.
     */
    ResetExprContext(econtext);

    /*
     * get a tuple from the access method.    Loop until we obtain a tuple that
     * passes the qualification.
     */
    for (;;)
    {
        TupleTableSlot *slot;

        CHECK_FOR_INTERRUPTS();

        slot = ExecScanFetch(node, accessMtd, recheckMtd);

        /*
         * if the slot returned by the accessMtd contains NULL, then it means
         * there is nothing more to scan so we just return an empty slot,
         * being careful to use the projection result slot so it has correct
         * tupleDesc.
         */
        if (TupIsNull(slot))
        {
            if (projInfo)
                return ExecClearTuple(projInfo->pi_slot);
            else
                return slot;
        }

        /*
         * place the current tuple into the expr context
         */
        econtext->ecxt_scantuple = slot;

        /*
         * check that the current tuple satisfies the qual-clause
         *
         * check for non-nil qual here to avoid a function call to ExecQual()
         * when the qual is nil ... saves only a few cycles, but they add up
         * ...
         */
        if (!qual || ExecQual(qual, econtext, false))
        {
            /*
             * Found a satisfactory scan tuple.
             */
            if (projInfo)
            {
                /*
                 * Form a projection tuple, store it in the result tuple slot
                 * and return it --- unless we find we can project no tuples
                 * from this scan tuple, in which case continue scan.
                 */
                resultSlot = ExecProject(projInfo, &isDone);
                if (isDone != ExprEndResult)
                {
                    node->ps.ps_TupFromTlist = (isDone == ExprMultipleResult);
                    return resultSlot;
                }
            }
            else
            {
                /*
                 * Here, we aren't projecting, so just return scan tuple.
                 */
                return slot;
            }
        }
        else
            InstrCountFiltered1(node, 1);

        /*
         * Tuple fails qual, so free per-tuple memory and try again.
         */
        ResetExprContext(econtext);
    }
}

现在抛开细节不谈，谈谈自己的观感：

梳理一下：

对于一个特定的查询而言，一旦执行计划指定了seqscan，那么接下来：

ExecutePlan --> ExecProcNode --> ExecSeqScan-->ExecScan

static void
ExecutePlan(EState *estate,
            PlanState *planstate,
            CmdType operation,
            bool sendTuples,
            long numberTuples,
            ScanDirection direction,
            DestReceiver *dest)
{
    ...
    /*
     * Loop until we've processed the proper number of tuples from the plan.
     */
    for (;;)
    {
        ...
        /*
         * Execute the plan and obtain a tuple
         */
        slot = ExecProcNode(planstate);
        ...

        /*
         * if the tuple is null, then we assume there is nothing more to
         * process so we just end the loop...
         */
        if (TupIsNull(slot))
            break;
        ...

        /*
         * check our tuple count.. if we've processed the proper number then
         * quit, else loop again and process more tuples.  Zero numberTuples
         * means no limit.
         */
        current_tuple_count++;

        if (numberTuples && numberTuples == current_tuple_count)
            break;
    }
}

ExecProcNode:

/* ----------------------------------------------------------------
 *        ExecProcNode
 *
 *        Execute the given node to return a(nother) tuple.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecProcNode(PlanState *node)
{
    ...

    switch (nodeTag(node))
    {
            /*
             * control nodes
             */
        case T_ResultState:
            
            result = ExecResult((ResultState *) node);
            break;
        ...
            /*
             * scan nodes
             */
        case T_SeqScanState:
            //fprintf(stderr,"T_SeqScanState\n");
            result = ExecSeqScan((SeqScanState *) node);
            break;

        case T_IndexScanState:
            fprintf(stderr,"T_IndexScanState\n");
            result = ExecIndexScan((IndexScanState *) node);
            break;
        ...
    }
    ...
return result;
}

ExecSeqScan:

TupleTableSlot *
ExecSeqScan(SeqScanState *node)
{
    return ExecScan((ScanState *) node,
                    (ExecScanAccessMtd) SeqNext,
                    (ExecScanRecheckMtd) SeqRecheck);
}

ExecScan:

/* ----------------------------------------------------------------
 *        ExecScan
 *
 *        Scans the relation using the 'access method' indicated and
 *        returns the next qualifying tuple in the direction specified
 *        in the global variable ExecDirection.
 *        The access method returns the next tuple and execScan() is
 *        responsible for checking the tuple returned against the qual-clause.
 *
 *        A 'recheck method' must also be provided that can check an
 *        arbitrary tuple of the relation against any qual conditions
 *        that are implemented internal to the access method.
 *
 *        Conditions:
 *          -- the "cursor" maintained by the AMI is positioned at the tuple
 *             returned previously.
 *
 *        Initial States:
 *          -- the relation indicated is opened for scanning so that the
 *             "cursor" is positioned before the first qualifying tuple.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecScan(ScanState *node,
         ExecScanAccessMtd accessMtd,    /* function returning a tuple */
         ExecScanRecheckMtd recheckMtd)
{
    ...

    /*
     * get a tuple from the access method.    Loop until we obtain a tuple that
     * passes the qualification.
     */
    for (;;)
    {
        ...

        slot = ExecScanFetch(node, accessMtd, recheckMtd);

        ...
    }
}

可以看到，全表扫描的时候，是一条条地往下捋，完全没有任何并发的可能。

所以，可以想象，遇到大表，Index立马变得十分重要。如果遇到大表和大表 Hash Join 之类的，那就自求多福吧。

我现在看到的是，加入调试信息后，对于如下的表：

postgres=# select * from tst04;
 id | val 
----+-----
  4 | 400
  1 | 100
  3 | 300
  2 | 200
(4 rows)

然后这样来查询：

postgres=# select * from tst04 where id>1;
 id | val 
----+-----
  4 | 400
  3 | 300
  2 | 200
(3 rows)

可以看到执行情况：

In Standard_ExecutorRun, count is 0
In ExecutePlan ... for loop by process 7365
In ExecScan... for loop ... by process 7365
In ExecutePlan ... for loop by process 7365
In ExecScan... for loop ... by process 7365
In ExecScan... for loop ... by process 7365
In ExecutePlan ... for loop by process 7365
In ExecScan... for loop ... by process 7365
In ExecutePlan ... for loop by process 7365
In ExecScan... for loop ... by process 7365

对为何执行这么多次，还不是很理解。