SQL版本的Dijkstra最短路径算法

受这篇文章《SQL,NoSQL以及数据库的实质》结尾处题目的启发,我尝试写了一个SQL版本的Dijkstra最短路径算法。算法描述如下:

前提假设:
     Hive支持Stored Procedure
     或者
     Mysql支持Insert into、insert overwrite、create table as select操作
 
数据结构:
//各个节点之间的距离
Table meta_distances(
     src int,
     dst int,
     distance int
)
//已知最短距离的点(及距离)
Table known_nodes(
     node int,
     distance int
)
//未知最短距离的点(及暂时的距离)
Table unknown_nodes(
     node int,
     distance int
)
求节点1到其他所有节点的距离
 
初始,各表包含的数据:
meta_distances:
     各个节点之间的距离
known_nodes:
     (1,0)
unknown_nodes:
     (2, MaxInt)
     (3, MaxInt)
     (4, MaxInt)
     …...
     (n, MaxInt)
 
     初始时,pivot_node = 1(即源节点):
-- Dijkstra single-source shortest path, expressed as set-based SQL.
-- Assumes the three tables described above already exist and are seeded:
--   known_nodes   holds (source, 0); unknown_nodes holds every other node
--   with distance = MaxInt; meta_distances holds the edge weights.
-- pivot_node: the source node id (its row must already be in known_nodes).
-- NOTE(review): `INSERT OVERWRITE TABLE` targeting a table that the SELECT
-- also reads is Hive semantics — confirm against the target engine.
Create Procedure Dijkstra(IN pivot_node int)
Begin
    declare unknown_nodes_count int default 1;
    declare min_distance int default 0;

    While unknown_nodes_count > 0 do
        -- Shortest distance from the source to the current pivot node.
        select distance into min_distance
        from known_nodes
        where node = pivot_node;

        -- Relaxation step: for every still-unknown node reachable from the
        -- pivot, compute its distance when routed through the pivot.
        drop table if exists tmp_distance_a;
        create table tmp_distance_a as
        select
            f1.dst,
            f1.distance + min_distance as distance
        from meta_distances as f1
        inner join unknown_nodes as f2
            on f1.dst = f2.node
        where f1.src = pivot_node;

        -- Keep, per unknown node, the smaller of its old tentative distance
        -- and the newly relaxed one. LEFT JOIN so nodes unreachable via the
        -- pivot (no tmp_distance_a row) keep their old distance.
        insert overwrite table unknown_nodes
        select
            f2.node,
            case
                when f1.dst is null then f2.distance
                when f1.distance > f2.distance then f2.distance
                else f1.distance
            end
        from unknown_nodes as f2
        left join tmp_distance_a as f1
            on f1.dst = f2.node;

        -- Pick the unknown node with the minimum tentative distance as the
        -- next pivot. LIMIT 1 breaks ties deterministically enough for the
        -- algorithm (any minimal node is a valid choice).
        select node into pivot_node
        from unknown_nodes
        where distance = (select min(distance) from unknown_nodes)
        limit 1;

        -- Settle the chosen pivot: copy it into known_nodes ...
        insert into known_nodes
        select node, distance
        from unknown_nodes
        where node = pivot_node;

        -- ... and remove exactly that node from unknown_nodes. Removing by
        -- node id (not by min distance) avoids dropping several tied nodes
        -- when only one of them was settled.
        insert overwrite table unknown_nodes
        select u.node, u.distance
        from unknown_nodes as u
        where u.node != pivot_node;

        -- Loop until every node's shortest distance has been settled.
        select count(*) into unknown_nodes_count from unknown_nodes;
    End While;
End

下面是等价的Java版本(通过字符串拼接调用Hive执行同样的SQL):

/**
 * Dijkstra single-source shortest path expressed as set-based Hive SQL,
 * driven by a thin Java loop.
 *
 * Table layout (created and seeded before calling {@link #compute(int)}):
 * meta_distances(src, dst, distance) holds the edge weights,
 * known_nodes(node, distance) starts with the source at distance 0,
 * unknown_nodes(node, distance) starts with every other node at MaxInt.
 *
 * NOTE(review): `hive` is a project-provided query runner declared elsewhere;
 * its execute/get semantics (including ${...} substitution, if any) should be
 * confirmed against its actual implementation.
 */
public class DijkstraSample {
    /**
     * Computes the shortest distance from the source to every node,
     * settling one node per iteration.
     *
     * @param pivotNode the source node id; reused as the "current pivot"
     *                  as the algorithm progresses
     */
    public static void compute(int pivotNode){
        while(true){
            // All shortest distances are settled once unknown_nodes is empty.
            if(hive.get("select count(*) from unknown_nodes")<=0){
                break;
            }
            // Relaxation step: distance of each still-unknown node when
            // routed through the current pivot. Each fragment ends with a
            // space so the concatenated SQL stays syntactically valid.
            hive.execute(
                    "drop table if exists tmp_distance_a",

                    "create table tmp_distance_a as " +
                    "select " +
                        "f1.dst, " +
                        "f1.distance + (select distance from known_nodes where node = " + pivotNode + ") as distance " +
                    "from " +
                        "meta_distances as f1 " +
                    "inner join " +
                        "unknown_nodes as f2 " +
                    "on " +
                        "f1.dst = f2.node " +
                    "where " +
                        "f1.src = " + pivotNode
                    );
            // Keep, per unknown node, the smaller of its old tentative
            // distance and the newly relaxed one. LEFT JOIN so nodes
            // unreachable via the pivot keep their old distance.
            hive.execute(
                    "insert overwrite table unknown_nodes " +
                    "select " +
                         "f2.node, " +
                         "case " +
                              "when f1.dst is null then f2.distance " +
                              "when f1.distance > f2.distance then f2.distance " +
                              "else f1.distance " +
                         "end " +
                    "from " +
                         "unknown_nodes as f2 " +
                    "left outer join " +
                         "tmp_distance_a as f1 " +
                    "on " +
                         "f1.dst = f2.node"
                 );
            // Pick the unknown node with the minimum tentative distance as
            // the next pivot. An aggregate is not legal directly in WHERE,
            // so the minimum comes from a scalar subquery; limit 1 breaks
            // ties (any minimal node is a valid choice).
            pivotNode=hive.get(
                    "select node " +
                    "from " +
                         "unknown_nodes " +
                    "where " +
                         "distance = (select min(distance) from unknown_nodes) " +
                    "limit 1"
                    );
            // Settle the chosen pivot: copy it into known_nodes.
            hive.execute(
                    "insert into table known_nodes " +
                    "select " +
                         "node, " +
                         "distance " +
                    "from " +
                         "unknown_nodes " +
                    "where " +
                         "node = " + pivotNode
                    );
            // Remove exactly that node from unknown_nodes. Filtering by node
            // id (not by min distance) avoids dropping several tied nodes
            // when only one of them was settled.
            hive.execute(
                    "insert overwrite table unknown_nodes " +
                    "select node, distance " +
                    "from unknown_nodes " +
                    "where node != " + pivotNode
                    );
        }
    }
}

也许有人会问:用SQL实现这个算法的意义是什么?它与用常规语言写出来的程序相比,几乎没有任何优势。当数据规模控制在一定范围之内时,我承认是这样的。但是,设想如果我们面对的节点规模是几百万甚至几千万数量级,而我们的机器只有几个G内存时,如何处理?

答案是:Hive+SQL。

原文地址:https://www.cnblogs.com/linghuaichong/p/4367055.html