SQL版本的Dijkstra最短路径算法
受这篇文章《SQL,NoSQL以及数据库的实质》结尾处题目的启发,我尝试写了一个SQL版本的Dijkstra最短路径算法。算法描述如下:
前提假设:
Hive支持Stored Procedure
或者
Mysql支持Insert into、insert overwrite、create table as select操作
数据结构:
//各个节点之间的距离
-- Edge list: weight of each directed edge between two nodes.
CREATE TABLE meta_distances (
    src      INT,  -- source node id of the edge
    dst      INT,  -- destination node id of the edge
    distance INT   -- edge weight (assumed non-negative for Dijkstra)
);
//已知最短距离的点(及距离)
-- Nodes whose shortest distance from the source is already final.
CREATE TABLE known_nodes (
    node     INT,  -- node id
    distance INT   -- final shortest distance from the source
);
//未知最短距离的点(及暂时的距离)
-- Nodes whose shortest distance is still tentative (initialized to MaxInt).
CREATE TABLE unknown_nodes (
    node     INT,  -- node id
    distance INT   -- current tentative distance from the source
);
求节点1到其他所有节点的距离
初始,各表包含的数据:
meta_distances:
各个节点之间的距离
known_nodes:
(1,0)
unknown_nodes:
(2, MaxInt)
(3, MaxInt)
(4, MaxInt)
...
(n, MaxInt)
pivot_node=1
-- Single-source shortest paths (Dijkstra) over the tables above.
-- pivot_node: the node most recently added to known_nodes (the source on
-- the first call). Loops until unknown_nodes is empty.
-- NOTE(review): written in MySQL stored-procedure syntax with Hive-style
-- INSERT OVERWRITE, matching the article's hybrid premise — it is
-- pseudocode and will not run verbatim on either engine alone.
CREATE PROCEDURE Dijkstra(IN pivot_node INT)
BEGIN
    DECLARE unknown_nodes_count INT DEFAULT 1;
    DECLARE min_distance INT DEFAULT 0;

    WHILE unknown_nodes_count > 0 DO
        -- Final distance from the source to the current pivot node.
        SELECT distance INTO min_distance
        FROM known_nodes
        WHERE node = pivot_node;

        -- Candidate distance for each unknown node when routed through the pivot.
        DROP TABLE IF EXISTS tmp_distance_a;
        CREATE TABLE tmp_distance_a AS
        SELECT
            f1.dst,
            f1.distance + min_distance AS distance
        FROM meta_distances AS f1
        INNER JOIN unknown_nodes AS f2
            ON f1.dst = f2.node
        WHERE f1.src = pivot_node;

        -- Relax: keep the smaller of the old tentative distance and the
        -- candidate distance through the pivot (NULL dst means no edge
        -- from the pivot reaches this node, so keep the old distance).
        INSERT OVERWRITE TABLE unknown_nodes
        SELECT
            f2.node,
            CASE
                WHEN f1.dst IS NULL              THEN f2.distance
                WHEN f1.distance > f2.distance   THEN f2.distance
                ELSE f1.distance
            END
        FROM tmp_distance_a AS f1
        RIGHT OUTER JOIN unknown_nodes AS f2
            ON f1.dst = f2.node;

        -- The unknown node with the minimum tentative distance is final;
        -- move it into known_nodes. (Aggregate must live in a subquery,
        -- not directly in WHERE.)
        INSERT INTO known_nodes
        SELECT node, distance
        FROM unknown_nodes
        WHERE distance = (SELECT MIN(distance) FROM unknown_nodes);

        -- That node becomes the next pivot (LIMIT 1 breaks ties).
        SELECT node INTO pivot_node
        FROM unknown_nodes
        WHERE distance = (SELECT MIN(distance) FROM unknown_nodes)
        LIMIT 1;

        -- Remove the now-known node from unknown_nodes. Deleting by node
        -- (not by distance) avoids dropping unrelated ties.
        INSERT OVERWRITE TABLE unknown_nodes
        SELECT node, distance
        FROM unknown_nodes
        WHERE node != pivot_node;

        -- Remaining unknown nodes; loop ends at zero.
        SELECT COUNT(*) INTO unknown_nodes_count
        FROM unknown_nodes;
    END WHILE;
END
java版本
public class DijkstraSample { public static void compute(int pivotNode){ int unknownNodesCount=1; while(true){ if(hive.get("select count(*) from unknown_nodes")<=0){ //所有点的最短距离都已经计算出 break; } //计算unknown_nodes中每一个node若经过pivot_node,与源点的距离 hive.execute( "drop table if exists tmp_distance_a", "create table tmp_distance_a"+ "as"+ "select"+ "f1.dst,"+ "distance+(select distance from known_nodes where node=${pivotNode})"+ "from"+ "(select"+ "distinct"+ "meta_distances.*"+ "from"+ "meta_distances as f1,"+ "unknown_nodes as f2"+ "where"+ "f1.dst=f2.node"+ "and"+ "f1.src=${pivotNode}"+ ")as tmp" ); //更新unknown_nodes的距离信息 hive.execute( "insert overwrite table unknown_nodes"+ "select"+ "f2.node"+ "IF(f1.node is null,"+ "f2.distance,"+ "IF(f1.distance>f2.distance,"+ "f2.distance,"+ "f1.distance"+ ")"+ ")"+ "from"+ "tmp_distance_a as f1"+ "right outer join"+ "unknown_nodes as f2"+ "on"+ "f1.dst=f2.node" ); //挑选出最小的node,放入known_nodes中 hive.execute( "insert into table known_nodes"+ "select"+ "node,"+ "distance"+ "from"+ "unknown_nodes"+ "where"+ "distance=min(distance)" ); //挑选出最小的node,最为下一个pivot_node pivotNode=hive.get( "select node"+ "from"+ "unknown_nodes"+ "where"+ "distance=min(distance)" ); //从unknown_nodes中删除最小node hive.execute( "insert overwrite into unknown_nodes"+ "select *"+ "from unkown_nodes"+ "where distance!=min(distance)" ); } } }
也许有人会问:用SQL实现这个算法的意义是什么?它与用常规语言写出来的程序相比,几乎没有任何优势。当数据规模控制在一定范围之内时,我承认是这样的。但是,设想如果我们面对的节点规模是几百万甚至几千万数量级,而我们的机器只有几个G内存时,如何处理?
答案是:Hive+SQL。