SQL版本的Dijkstra最短路径算法

受这篇文章《SQL,NoSQL以及数据库的实质》结尾处题目的启发,我尝试写了一个SQL版本的Dijkstra最短路径算法。算法描述如下:

前提假设:
     Hive支持Stored Procedure
     或者
     Mysql支持Insert into、insert overwrite、create table as select操作
 
数据结构:
//各个节点之间的距离
Table meta_distances(
     src int,
     dst int,
     distance int
)
//已知最短距离的点(及距离)
Table known_nodes(
     node int,
     distance int
)
//未知最短距离的点(及暂时的距离)
Table unknown_nodes(
     node int,
     distance int
)
求节点1到其他所有节点的距离
 
初始,各表包含的数据:
meta_distances:
     各个节点之间的距离
known_nodes:
     (1,0)
unknown_nodes:
     (2, MaxInt)
     (3, MaxInt)
     (4, MaxInt)
     …...
     (n, MaxInt)
 
     初始时,pivot_node = 1(即源节点):
-- Dijkstra single-source shortest path, expressed as set-based SQL.
-- Assumes the three tables described above already exist and are seeded:
--   known_nodes   holds (source, 0); unknown_nodes holds every other node
--   with distance = MaxInt; meta_distances holds the edge weights.
-- pivot_node: the source node id (its row must already be in known_nodes).
-- NOTE(review): `INSERT OVERWRITE TABLE` targeting a table that the SELECT
-- also reads is Hive semantics — confirm against the target engine.
Create Procedure Dijkstra(IN pivot_node int)
Begin
    declare unknown_nodes_count int default 1;
    declare min_distance int default 0;

    While unknown_nodes_count > 0 do
        -- Shortest distance from the source to the current pivot node.
        select distance into min_distance
        from known_nodes
        where node = pivot_node;

        -- Relaxation step: for every still-unknown node reachable from the
        -- pivot, compute its distance when routed through the pivot.
        drop table if exists tmp_distance_a;
        create table tmp_distance_a as
        select
            f1.dst,
            f1.distance + min_distance as distance
        from meta_distances as f1
        inner join unknown_nodes as f2
            on f1.dst = f2.node
        where f1.src = pivot_node;

        -- Keep, per unknown node, the smaller of its old tentative distance
        -- and the newly relaxed one. LEFT JOIN so nodes unreachable via the
        -- pivot (no tmp_distance_a row) keep their old distance.
        insert overwrite table unknown_nodes
        select
            f2.node,
            case
                when f1.dst is null then f2.distance
                when f1.distance > f2.distance then f2.distance
                else f1.distance
            end
        from unknown_nodes as f2
        left join tmp_distance_a as f1
            on f1.dst = f2.node;

        -- Pick the unknown node with the minimum tentative distance as the
        -- next pivot. LIMIT 1 breaks ties deterministically enough for the
        -- algorithm (any minimal node is a valid choice).
        select node into pivot_node
        from unknown_nodes
        where distance = (select min(distance) from unknown_nodes)
        limit 1;

        -- Settle the chosen pivot: copy it into known_nodes ...
        insert into known_nodes
        select node, distance
        from unknown_nodes
        where node = pivot_node;

        -- ... and remove exactly that node from unknown_nodes. Removing by
        -- node id (not by min distance) avoids dropping several tied nodes
        -- when only one of them was settled.
        insert overwrite table unknown_nodes
        select u.node, u.distance
        from unknown_nodes as u
        where u.node != pivot_node;

        -- Loop until every node's shortest distance has been settled.
        select count(*) into unknown_nodes_count from unknown_nodes;
    End While;
End

下面是等价的Java版本(通过字符串拼接调用Hive执行同样的SQL):

/**
 * Dijkstra single-source shortest path expressed as set-based Hive SQL,
 * driven by a thin Java loop.
 *
 * Table layout (created and seeded before calling {@link #compute(int)}):
 * meta_distances(src, dst, distance) holds the edge weights,
 * known_nodes(node, distance) starts with the source at distance 0,
 * unknown_nodes(node, distance) starts with every other node at MaxInt.
 *
 * NOTE(review): `hive` is a project-provided query runner declared elsewhere;
 * its execute/get semantics (including ${...} substitution, if any) should be
 * confirmed against its actual implementation.
 */
public class DijkstraSample {
    /**
     * Computes the shortest distance from the source to every node,
     * settling one node per iteration.
     *
     * @param pivotNode the source node id; reused as the "current pivot"
     *                  as the algorithm progresses
     */
    public static void compute(int pivotNode){
        while(true){
            // All shortest distances are settled once unknown_nodes is empty.
            if(hive.get("select count(*) from unknown_nodes")<=0){
                break;
            }
            // Relaxation step: distance of each still-unknown node when
            // routed through the current pivot. Each fragment ends with a
            // space so the concatenated SQL stays syntactically valid.
            hive.execute(
                    "drop table if exists tmp_distance_a",

                    "create table tmp_distance_a as " +
                    "select " +
                        "f1.dst, " +
                        "f1.distance + (select distance from known_nodes where node = " + pivotNode + ") as distance " +
                    "from " +
                        "meta_distances as f1 " +
                    "inner join " +
                        "unknown_nodes as f2 " +
                    "on " +
                        "f1.dst = f2.node " +
                    "where " +
                        "f1.src = " + pivotNode
                    );
            // Keep, per unknown node, the smaller of its old tentative
            // distance and the newly relaxed one. LEFT JOIN so nodes
            // unreachable via the pivot keep their old distance.
            hive.execute(
                    "insert overwrite table unknown_nodes " +
                    "select " +
                         "f2.node, " +
                         "case " +
                              "when f1.dst is null then f2.distance " +
                              "when f1.distance > f2.distance then f2.distance " +
                              "else f1.distance " +
                         "end " +
                    "from " +
                         "unknown_nodes as f2 " +
                    "left outer join " +
                         "tmp_distance_a as f1 " +
                    "on " +
                         "f1.dst = f2.node"
                 );
            // Pick the unknown node with the minimum tentative distance as
            // the next pivot. An aggregate is not legal directly in WHERE,
            // so the minimum comes from a scalar subquery; limit 1 breaks
            // ties (any minimal node is a valid choice).
            pivotNode=hive.get(
                    "select node " +
                    "from " +
                         "unknown_nodes " +
                    "where " +
                         "distance = (select min(distance) from unknown_nodes) " +
                    "limit 1"
                    );
            // Settle the chosen pivot: copy it into known_nodes.
            hive.execute(
                    "insert into table known_nodes " +
                    "select " +
                         "node, " +
                         "distance " +
                    "from " +
                         "unknown_nodes " +
                    "where " +
                         "node = " + pivotNode
                    );
            // Remove exactly that node from unknown_nodes. Filtering by node
            // id (not by min distance) avoids dropping several tied nodes
            // when only one of them was settled.
            hive.execute(
                    "insert overwrite table unknown_nodes " +
                    "select node, distance " +
                    "from unknown_nodes " +
                    "where node != " + pivotNode
                    );
        }
    }
}

也许有人会问:用SQL实现这个算法的意义是什么?它与用常规语言写出来的程序相比,几乎没有任何优势。当数据规模控制在一定范围之内时,我承认是这样的。但是,设想如果我们面对的节点规模是几百万甚至几千万数量级,而我们的机器只有几个G内存时,如何处理?

答案是:Hive+SQL。

原文地址:https://www.cnblogs.com/linghuaichong/p/4367055.html