Dewey Allen
and 1 contributors

NAME

Graph::Dijkstra - Dijkstra computations with methods to input/output graph datasets from/to supported file formats

SYNOPSIS

  use Graph::Dijkstra;
  
  # create the object
  my $graph = Graph::Dijkstra->new();  #create the graph object with default attribute values
  my $graph = Graph::Dijkstra->new( {label=>'my graph label', creator=>'my name', edgedefault=>'undirected'} );  #create the graph object setting the label, creator, and/or edgedefault attibutes
  my $graph = Graph::Dijkstra->inputGraphfromCSV($filename);  #load graph from supported file format creating the graph object.
  
  #SET method to update graph attributes
  $graph->graph( {label=>'my graph label', creator=>'my name', edgedefault='directed'} );
  
  #GET method to return graph attributes in hash (reference)(eg., $graphAttribs->{label}, $graphAttribs->{creator})
  my $graphAttribs = $graph->graph();
  my $graphLabel = $graph->graph()->{label}; #references label attribute of graph
  
  #SET methods to create, update, and delete (remove) nodes and edges
  $graph->node( {id=>0, label=>'nodeA'} );   #create or update existing node.  id must be a simple scalar.
  $graph->node( {id=>1, label=>'nodeB'} );   #label is optional and should be string
  $graph->node( {id=>2, label=>'nodeC'} );
  $graph->edge( {sourceID=>0, targetID=>1, weight=>3, id=>'A', label=>'edge 0 to 1', directed='directed'} );  #create or update edge between sourceID and targetID;  weight (cost) must be > 0
  $graph->edge( {sourceID=>1, targetID=>2, weight=>2} );
  $graph->removeNode( 0 );
  $graph->removeEdge( {sourceID=>0, targetID=1} );
  
  #GET methods for nodes and edges
  my $nodeAttribs = $graph->node( 0 );  #returns hash reference (eg., $nodeAttribs->{id}, $nodeAttribs->{label}) or undef if node id 0 not found
  my $nodeLabel = $graph->node( 0 )->{label}; #references label attribute of node with ID value of 0
  my $bool = $graph->nodeExists( 0 );
  my $edgeAttribs = $graph->edge( { sourceID=>0, targetID=>1} );
  my $edgeWeight = $graph->edge( { sourceID=>0, targetID=>1} )->{weight};  #references weight attribute of edge that connects sourceID to targetID
  my $bool = $graph->edgeExists( { sourceID=>0, targetID=>1 } );
  my @nodes = $graph->nodeList();  #returns array (list) of all nodes in the internal graph, each array element is a hash (reference) containing the node ID and label attributes
  my @nodes = $graph->nodeIDsList();  #returns array (list) of all node ID values in the internal graph, each element is a nodeID value.
  my $bool = $graph->adjacent( { sourceID=>0, targetID=>1 } );
  my @nodes = $graph->adjacentNodes( 0 ); #returns list of node IDs connected by an edge with node ID 0
  
  #methods to input graph from a supported file format
  #inputGraphfrom[Format] methods can also be invoked as class methods which return the graph object on success
  $graph->inputGraphfromGML('mygraphfile.gml');
  $graph->inputGraphfromCSV('mygraphfile.csv');
  my $graph = Graph::Dijkstra->inputGraphfromCSV('mygraphfile.csv');
  $graph->inputGraphfromJSON('mygraphfile.json');   #JSON Graph Specification
  $graph->inputGraphfromGraphML('mygraphfile.graphml.xml', {keyEdgeValueID => 'weight', keyNodeLabelID => 'name'} );
  $graph->inputGraphfromGEXF('mygraphfile.gexf.xml' );
  $graph->inputGraphfromNET('mygraphfile.pajek.net' );   #NET (Pajek) format
  
  #methods to output internal graph to a supported file format
  $graph->outputGraphtoGML('mygraphfile.gml', 'creator name');
  $graph->outputGraphtoCSV('mygraphfile.csv');
  $graph->outputGraphtoJSON('mygraphfile.json');
  $graph->outputGraphtoGraphML('mygraphfile.graphml.xml', {keyEdgeWeightID => 'weight',keyEdgeWeightAttrName => 'weight', keyNodeLabelID => 'name', keyNodeLabelID => 'name'});
  $graph->outputGraphtoGEXF('mygraphfile.gexf.xml');
  $graph->outputGraphtoNET('mygraphfile.pajek.net');
  
  #class methods that validate the contents of XML files against the GraphML and GEXF schemas
  my ($bool, $errmsg) = Graph::Dijkstra->validateGraphMLxml('mygraphfile.graphml.xml');
  my ($bool, $errmsg) = Graph::Dijkstra->validateGEXFxml('mygraphfile.gexf.xml');
  
  #Dijkstra shortest path computation methods
  
  use Data::Dumper;
  my %Solution = ();
  
  #shortest path to farthest node from origin node
  %Solution = (originID => 0);
  if (my $solutionWeight = $graph->farthestNode( \%Solution )) {
        print Dumper(\%Solution);
  }
  
  #shortest path between two nodes (from origin to destination)
  %Solution = (originID => 0, destinationID => 2);
  if (my $pathCost = $graph->shortestPath( \%Solution ) ) {
        print Dumper(\%Solution);
  }
  
  #Jordan or vertex center with all points shortest path (APSP) matrix
  my %solutionMatrix = ();
  if (my $graphMinMax = $graph->vertexCenter(\%solutionMatrix) ) {
        print  {$verboseOutfile} "Center Node Set 'eccentricity', minimal greatest distance to all other nodes $graphMinMax\n";
        print  {$verboseOutfile} "Center Node Set = ", join(',', @{$solutionMatrix{centerNodeSet}} ), "\n";
        
        my @nodeList = (sort keys %{$solutionMatrix{row}});
        print  {$verboseOutfile} 'From/To,', join(',',@nodeList), "\n";
        foreach my $fromNode (@nodeList) {
                print  {$verboseOutfile} "$fromNode";
                foreach my $toNode (@nodeList) {
                        print  {$verboseOutfile} ",$solutionMatrix{row}{$fromNode}{$toNode}";
                }
                print  {$verboseOutfile} "\n";
        }
        $graph->outputAPSPmatrixtoCSV(\%solutionMatrix, 'APSP.csv');
  }
  
  #Alternative implementation of Jordan (vertex center) with APSP matrix method using Floyd Warhsall algorithm
  %solutionMatrix = ();
  my $graphMinMax = $graph->vertexCenterFloydWarshall(\%solutionMatrix);
  
  
  #Misc class methods
  
  #turn on / off verbose output to STDOUT or optional $filehandle
  Dijkstra::Graph->VERBOSE($bool, $filehandle);   
 
  #attribute hashRef to string
  my $attribStr = $graph->stringifyAttribs( $graph->graph() );  #( creator=>'', edgedefault=>'undirected', label=>'' )
  
  #string to attribute hashRef
  my $attribHref = Dijkstra::Graph->hashifyAttribs( $attribStr );   #{'creator' => '', 'edgedefault' => 'undirected', 'label' => '' }
  
  

DESCRIPTION

Efficient implementation of Dijkstras shortest path algorithm in Perl using a Minimum Priority Queue (Array::Heap::ModifiablePriorityQueue**).

Computation methods.

        farthestNode() Shortest path to farthest node from an origin node
        shortestPath() Shortest path between two nodes
        vertexCenter() Jordan center node set (vertex center) with all points shortest path (APSP) matrix*

*Version 0.60 added a second implementation of the vertex center method using the Floyd Warshall algorithm.

File input output methods support the following graph file formats.

        GML (Graph Modelling Language, not to be confused with Geospatial Markup Language or Geography Markup Language), 
        JSON Graph Specification (latest draft specification), 
        GraphML (XML based), 
        GEXF (Graph Exchange XML Format), 
        NET (Pajek), and
        CSV (a simple text based format, "native" to this module ).

Graph::Dijkstra supports undirected and directed graphs including mixed graphs with both types of edges. Does not support loops (edge where sourceID == targetID) or negitive edge weights. Support for directed graphs/edges is new in version 0.50 (pre-production).

In this pre-production release, the internal graph data model is fixed.

        Graph has three attributes: 'label', 'creator', and 'edgedefault'.  'edgedefault' must be either 'directed' or 'undirected' and defaults to 'undirected'.
        
        Nodes (vertices) have three attributes: 'id' (simple scalar, required, unique), 'label' (optional string), and 'edges', a list (hash) of the node id values of connected nodes.
        
        Edges have four required and two optional attributes.  Required edge attributes: 'targetID' and 'sourceID' (node 'id' values that must exist), 'weight'(cost/value/distance/amount), and 'directed'.
        'directed' must be 'directed' or 'undirected' and defaults to the graph 'edgedefault' value. 'weight' must be a positive number (integer or real) greater than 0.
        Optional edge attributes: edge 'id' and edge 'label'.  Note that there is no uniqueness checked for edge 'id' values.  Could be added in later version.

The outputGraphto[file format] methods output data elements from the internal graph. If converting between two supported formats (eg., GML and GraphML), unsupported attributes from the input file (which are not saved in the internal graph) are *not* be written to the output file. Later releases will extend the internal graph data model.

This pre-production release has not been sufficiently tested with real-world graph data sets. It has been tested with rather large datasets (tens of thousands of nodes, hundreds of thousands of edges).

If you encounter a problem or have suggested improvements, please email the author and include a sample dataset. If providing a sample data set, please scrub it of any sensitive or confidential data.

**Array::Heap::ModifiablePriorityQueue, written in Perl, uses Array::Heap, an xs module.

METHODS

Class Methods

Graph::Dijkstra->VERBOSE( $bool, $filehandle );

Class method that turns on or off informational output to STDOUT or optional $filehandle. Called with no parameters, returns current VERBOSE setting. If $filehandle is included, caller is responsible to open and close.

my $graph = Graph::Dijsktra->new();

Create a new, empty graph object. Returns the object on success.

my $graph = Graph::Dijsktra->new( {label=>'my graph label', creator=>'my name', edgedefault=>'undirected'} );

Create a new, empty graph object setting the graph level attributes label, creator, and edgedefault. Returns the object on success.

my $graph = Graph::Dijsktra->inputGraphfromCSV($filename);

Input a graph from a supported file format and return a graph object with graph attributes set from the file values. Works with all supported file formats.

my $attribStr = $graph->stringifyAttribs( $graph->graph() );

Returns a string representation of the attribute hashRef. Sample: ( creator=>'', edgedefault=>'undirected', label=>'' )

my $attribHref = Dijkstra::Graph->hashifyAttribs( $attribStr );

Returns a hash reference from a string representation of an attribute hash. Sample: {'creator' => '', 'edgedefault' => 'undirected', 'label' => '' }

Graph methods

$graph->graph( {label=>'my graph label', creator=>'my name', edgedefault=>'directed'} );

SET method that updates the graph level attributes label and creator.

my $graphAttribs = $graph->graph();

GET method that returns the graph level attributes label and creator in a hash reference (eg., $graphAttribs->{label} $graphAttribs->{creator} $graphAttribs->{edgedefault} );

Node methods

$graph->node( {id=>$id, label=>$label} );

SET method: creates or updates existing node and returns self. Node id values must be simple scalars.

my $nodeAttribs = $graph->node( $id );

GET method: returns the hash (reference) with the id and label values for that node or undef if the node ID does not exist. Note: nodes may have blank ('') labels. Use nodeExists method to test for existance.

my $bool = $graph->nodeExists( $id );

GET method: returns true if a node with that ID values exists or false if not.

my @list = $graph->nodeList();

Returns unsorted array (list) of the nodes in the graph. Each list element is a hash (reference) that contains an "id" and "label" attribute. $list[0]->{id} is the id value of the first node and $list[0]->{label} is the label value of the first node.

my @list = $graph->nodeIDsList();

Returns unsorted array (list) of the node ID values in the graph.

$graph->removeNode( $id );

Removes node identified by $id and all connecting edges and returns self. Returns undef if $id does not exist.

Edge methods

$graph->edge( {sourceID=>$sourceID, targetID=>$targetID, weight=>$weight, id=>'A', label=>'father', directed=>'undirected'} );

SET method: creates or updates existing edge between $sourceID and $targetID and returns $self. $weight must be > 0. If "directed" not set, edge direction defaults to the graphs "edgedefault" value.

Returns undef if $weight <= 0 or if $sourceID or $targetID do not exist.

Carp's and returns undef if update would change the "directed" value of the edge. To change an edge's directed value, delete (removeEdge) first.

my $graphAtrribs = $graph->edge( {sourceID=>$sourceID, targetID=>$targetID} );

GET method: returns hash reference with edge attributes: sourceID, targetID, weight, edge ID, edge label, and edge "directed". "weight" is 0 if there is no edge between sourceID and targetID (and sourceID and targetID both exist). Returns undef if sourceID or targetID do not exist.

my $bool = $graph->edgeExists( { sourceID=>$sourceID, targetID=>$targetID } );

GET method: returns true if an edge connects the source and target IDs or false if an edge has not been defined.

my $bool = $graph->adjacent( { sourceID=>$sourceID, targetID=>$targetID } );

GET method: returns true if an edge connects $sourceID and $targetID or false if not. Returns undef if $sourceID or $targetID do not exist.

my @list = $graph->adjacentNodes( $id );

Returns unsorted list of node IDs connected to node $id by an edge. Returns undef if $id does not exist.

$graph->removeEdge( { sourceID=>$sourceID, targetID=>$targetID } );

Removes edge between $sourceID and $targetID (and $targetID and $sourceID) and returns self. Returns undef if $sourceID or $targetID do not exist.

Dijkstra computation methods

my $solutionWeight = $graph->farthestNode( $solutionHref );

Returns the cost of the shortest path to the farthest node from the origin. Attribute originID must be set in the parameter (hash reference) $solutionHref. Carps and returns 0 if originID attribute not set or if node ID value not found in internal graph. Populates $solutionHref (hash reference) with total weight (cost) of the farthest node(s) from the originID and the edge list from the originID to the farthest node(s). When there is more than one solution (two or more farthest nodes from the origin with the same total weight/cost), the solution hash includes multiple "path" elements, one for each farthest node.

Sample code

        my %Solution = (originID=>'I');
        if ( my $solutionWeight = $graph->farthestNode(\%Solution) ) {
                print Dumper(\%Solution);
                foreach my $i (1 .. $Solution{count}) {
                        print "From originID $Solution{originID}, solution path ($i) to farthest node $Solution{path}{$i}{destinationID} at weight (cost) $Solution{weight}\n";
                        foreach my $edgeHref (@{$Solution{path}{$i}{edges}}) {
                                print "\tsourceID='$edgeHref->{sourceID}' targetID='$edgeHref->{targetID}' weight='$edgeHref->{weight}'\n";
                        }
                }
        }

Produces the following output

        $VAR1 = {
          'weight' => 18,
          'originID' => 'I',
          'desc' => 'farthest',
          'count' => 2,
          'path' => {
                      '1' => {
                               'destinationID' => 'A',
                               'edges' => [
                                            {
                                              'sourceID' => 'I',
                                              'targetID' => 'L',
                                              'weight' => 4
                                            },
                                            {
                                              'weight' => 6,
                                              'targetID' => 'H',
                                              'sourceID' => 'L'
                                            },
                                            {
                                              'sourceID' => 'H',
                                              'targetID' => 'D',
                                              'weight' => 5
                                            },
                                            {
                                              'weight' => 3,
                                              'targetID' => 'A',
                                              'sourceID' => 'D'
                                            }
                                          ]
                             },
                      '2' => {
                               'destinationID' => 'C',
                               'edges' => [
                                            {
                                              'sourceID' => 'I',
                                              'targetID' => 'J',
                                              'weight' => 2
                                            },
                                            {
                                              'weight' => 9,
                                              'targetID' => 'K',
                                              'sourceID' => 'J'
                                            },
                                            {
                                              'targetID' => 'G',
                                              'sourceID' => 'K',
                                              'weight' => 2
                                            },
                                            {
                                              'weight' => 5,
                                              'sourceID' => 'G',
                                              'targetID' => 'C'
                                            }
                                          ]
                             }
                    }
        };

        From originID I, solution path (1) to farthest node A at weight (cost) 18
                sourceID='I' targetID='L' weight='4'
                sourceID='L' targetID='H' weight='6'
                sourceID='H' targetID='D' weight='5'
                sourceID='D' targetID='A' weight='3'
        From originID I, solution path (2) to farthest node C at weight (cost) 18
                sourceID='I' targetID='J' weight='2'
                sourceID='J' targetID='K' weight='9'
                sourceID='K' targetID='G' weight='2'
                sourceID='G' targetID='C' weight='5'
my $solutionWeight = $graph->shortestPath( $solutionHref );

Returns weight (cost) of shortest path between originID and destinationID (set in $solutionHref hash reference) or 0 if there is no path between originID and destinationID. Carps if originID or destinationID not set or node ID values not found in internal graph. Populates $solutionHref (hash reference) with total path weight (cost) and shortest path edge list.

Sample code

        my %Solution = ( originID=>'I', destinationID=>'A' );
        if ( my $pathCost = $graph->shortestPath(\%Solution) ) {
                print Dumper(\%Solution);
                print "Solution path from originID $Solution{originID} to destinationID $Solution{destinationID} at weight (cost) $Solution{weight}\n";
                foreach my $edgeHref (@{$Solution{edges}}) {
                        print "\tsourceID='$edgeHref->{sourceID}' targetID='$edgeHref->{targetID}' weight='$edgeHref->{weight}'\n";
                }
        }
        

Produces the following output

        $VAR1 = {
          'destinationID' => 'A',
          'weight' => 18,
          'desc' => 'path',
          'originID' => 'I',
          'edges' => [
                       {
                         'weight' => 4,
                         'sourceID' => 'I',
                         'targetID' => 'L'
                       },
                       {
                         'targetID' => 'H',
                         'sourceID' => 'L',
                         'weight' => 6
                       },
                       {
                         'targetID' => 'D',
                         'weight' => 5,
                         'sourceID' => 'H'
                       },
                       {
                         'weight' => 3,
                         'sourceID' => 'D',
                         'targetID' => 'A'
                       }
                     ]
        };
        
        Solution path from originID I to destinationID A at weight (cost) 18
                sourceID='I' targetID='L' weight='4'
                sourceID='L' targetID='H' weight='6'
                sourceID='H' targetID='D' weight='5'
                sourceID='D' targetID='A' weight='3'
my $graphMinMax = $graph->vertexCenter($solutionMatrixHref);

Returns the graph "eccentricity", the minimal greatest distance to all other nodes from the "center node" set or Jordan center. Carps if graph contains disconnected nodes (nodes with no edges) which are excluded. If graph contains a disconnected sub-graph (a set of connected nodes isoluated / disconnected from all other nodes), the return value is 0 -- as the center nodes are undefined.

The $solutionMatrix hash (reference) is updated to include the center node set (the list of nodes with the minimal greatest distance to all other nodes) and the all points shortest path matrix. In the all points shortest path matrix, an infinite value (1.#INF) indicates that there is no path from the origin to the destination node. In this case, the center node set is empty and the return value is 0.

Includes a class "helper" method that outputs the All Points Shortest Path matrix to a CSV file.

NOTE: The size of the All Points Shortest Path matrix is nodes^2 (expontial). A graph with a thousand nodes results in a million entry matrix that will take a long time to compute. Have not evaluated the practical limit on the number of nodes.

Sample code

        my %solutionMatrix = ();
        
        my $graphMinMax = $graph->vertexCenter(\%solutionMatrix);
        print "Center Node Set = ", join(',', @{$solutionMatrix{centerNodeSet}} ), "\n";
        print "Center Node Set 'eccentricity', minimal greatest distance to all other nodes $graphMinMax\n";
        
        my @nodeList = (sort keys %{$solutionMatrix{row}});
        #  or (sort {$a <=> $b} keys %{$solutionMatrix{row}}) if the $nodeID values are numeric
        
        print 'From/To,', join(',',@nodeList), "\n";
        foreach my $fromNode (@nodeList) {
                print "$fromNode";
                foreach my $toNode (@nodeList) {
                        print ",$solutionMatrix{row}{$fromNode}{$toNode}";
                }
                print "\n";
        }
        
        # Output All Points Shortest Path matrix to a .CSV file.  
        # If the nodeID values are numeric, include a third parameter, 'numeric' to sort the nodeID values numerically.
        
        $graph->outputAPSPmatrixtoCSV(\%solutionMatrix, 'APSP.csv');
my $graphMinMax = $graph->vertexCenterFloydWarshall($solutionMatrixHref);

Same as inputs and outputs as vertexCenter method.

Implementation of the Floyd Warshall alogithm with an author included performance tweak: skips nodes with no outbound edges (node outdegree = 0) as the distance from these nodes to all other nodes is infinite.

Input graph methods

Note: all inputGraphfrom[format] methods can be called as class methods (eg., my $graph = Graph::Dijkstra->inputGraphfromJSON($filename); )

$graph->inputGraphfromJSON($filename, {edgeWeightKey=>'value'});

Inputs nodes and edges from a JSON format file following the draft JSON Graph Specification. Optional hash reference following filename may contain a single attribute, edgeWeightKey, used to identify the edge weight attribute in the metadata section. Default value is "value".

Supports single graph files only. JSON Graph Specification files using the "Graphs" (multi-graph) attribute are not supported.

If the Graph metadata section includes the "creator" attribute, sets the internal graph attribute "creator". If the edge metadata section includes an id value, sets the edge id value.

Edge values/weights/costs are input using the edge metadata edgeWeightKey ("value") attribute and edge ids using the id value. Example edge that includes metadata value attribute per JSON Graph Specification. { "source" : "1", "target" : "2", "metadata" : { "id" : "A", "value" : "0.5" } },

See JSON Graph Specification https://github.com/jsongraph/json-graph-specification

$graph->inputGraphfromGML($filename);

Inputs nodes and edges from a Graphics Modelling Language format file (not to be confused with the Geospatial Markup Language XML format). Implemented using pattern matching (regexp's) on "node" and "edge" constructs. An unmatched closing bracket (']') inside a quoted string attribute value will break the pattern matching. Quoted string attribute values (e.g., a label value) should not normally include an unmatched closing bracket. Report as a bug and I'll work on re-implementing using a parser.

See Graph Modelling Language https://en.wikipedia.org/wiki/Graph_Modelling_Language

$graph->inputGraphfromCSV($filename);

Inputs nodes and edges from a CSV format file developed by the author. The first column in each "row" is "graph", "node" or "edge". Subsequent columns are attribute value pairs (eg., attrib=>'value'). Only non-blank attributes are included.

Example

        graph,edgedefault=>'undirected'
        node,id=>'A',label=>'one'
        node,id=>'C',label=>'three'
        node,id=>'B',"label=>'node B label'"
        node,id=>'D',"label=>'node D label'"
        edge,directed=>'directed',id=>4,sourceID=>'A',targetID=>'D',weight=>3
        edge,directed=>'undirected',id=>5,sourceID=>'A',targetID=>'B',weight=>4
        edge,directed=>'undirected',id=>3,sourceID=>'C',targetID=>'B',weight=>7
        edge,directed=>'undirected',id=>2,"label=>'Test Test'",sourceID=>'C',targetID=>'D',weight=>3
        edge,directed=>'undirected',id=>1,sourceID=>'B',targetID=>'D',weight=>5
        
$graph->inputGraphfromGraphML($filename, {keyEdgeValueID => 'weight', keyNodeLabelID => 'name'} );

Inputs nodes and edges from an XML format file following the GraphML specification.

Input files must contain only a single graph and cannot contain embedded graphs. Hyperedges are not supported.

The options hash reference (second parameter following the filename) is used to provide the key element ID values for edge weight/value/cost/distance and node label/name/description.

If either is not provided, the method will search the key elements for (1) edge attributes (for="edge") with an attr.name value of weight, value, cost, or distance; and (2) node attributes (for="node") with an attr.name value of label, name, description, or nlabel.

Graphs must contain a "key" attribute for edges that identifies the edge weight/value/cost/distance such as for="edge" attrib.name="weight". If this key element includes a child element that specifies a default value, that default value will be used to populate the weight (cost/value/distance) for each edge node that does not include a weight/value/cost/distance data element. Seems odd to specify a default edge weight but it will be accepted.

  <key id="d1" for="edge" attr.name="weight" attr.type="double">
    <default>2.2</default>
  </key>

        <edge id="7" source="1" target="2">
                <data key="weight">0.5</data>
        </edge>

Graphs should contain a "key" attribute for nodes that identifies the node label / name / description such as for="node" attrib.name="name" or for="node" attrib.name="label". These are used to populate the internal graph "label" value for each node. If not included, the internal node labels will be empty strings.

        <key id="name" for="node" attr.name="name" attr.type="string"/>
        
        <node id="4">
                <data key="name">josh</data>
        </node>

See GraphML Primer http://graphml.graphdrawing.org/primer/graphml-primer.html and GraphML example http://gephi.org/users/supported-graph-formats/graphml-format/

$graph->inputGraphfromGEXF( $filename );

Inputs nodes and edges from an XML format file following the GEXF draft 1.2 specification. Will also accept draft 1.1 specification files.

Files with multiple graphs are not supported. Hierarchial nodes are not supported. Croaks if either are detected.

Carps on files with a graph element attribute mode="dynamic".

Node elements are expected to contain a label element.

Edge elements should contain a weight attribute but will default to 1. Edge weights are input from either the edge element "weight" attribute or from an attvalues/attvalue element as follows.

        <attvalues>
                <attvalue for="weight" value="1.0"></attvalue>
        </attvalues>
$graph->inputGraphfromNET( $filename );

Input nodes and edges from NET (Pajek) format files. Inputs nodes from *Vertices section and edges from *Edges and *Arcs sections. All other sections are ignored including: *Arcslist and *Edgeslist. Edges are undirected and arcs are directed.

NET (Pajek) files with multiple (time based) Vertices and Edges sections are not supported.

Output graph methods

$graph->outputGraphtoGML($filename, $creator);

Using the internal graph, outputs a file in GML format. Includes a "creator" entry on the first line of the file with a date and timestamp. Note that non-numeric node IDs and label values are HTML encoded.

$graph->outputGraphtoJSON( $filename, {edgeWeightKey='value'} );

Using the internal graph, outputs a file following the JSON graph specification. In the Graph metadata section, includes a comment attribute referencing this module and the local time that the JSON document was created. Uses the edgeWeightKey value (which defaults to "value") to output edge weights in the edge metadata section. See the inputGraphfromJSON method for format details.

$graph->outputGraphtoCSV( $filename );

Using the internal graph, outputs a file in CSV format. See the inputGraphfromCSV method for format details.

$graph->outputGraphtoGraphML($filename, {keyEdgeWeightID => '',keyEdgeWeightAttrName => '', keyNodeLabelID => '', keyNodeLabelID => ''} );

Using the internal graph, outputs a file in XML format following the GraphML specification (schema). The option attributes keyEdgeWeightID and keyEdgeWeightAttrName both default to 'weight'. keyNodeLabelID and keyNodeLabelID both default to 'name'. Set these attributes values only if you need to output different values for these key attributes.

$graph->outputGraphtoGEXF( $filename );

Using the internal graph, outputs a file in XML format following the GEXF draft 1.2 specification (schema).

$graph->outputGraphtoNET( $filename );

Using the internal graph, outputs a file in NET (Pajek) format. For node IDs, the NET format uses consecutive (sequential) numeric (integer) values (1 .. # of nodes). If the internal graph uses sequential numeric IDs, these will be preserved in the output file. Otherwise, the existing node IDs are mapped to sequentially numbered IDs that are output. This preserves the graph node and edge structure but necessarily looses the existing node ID values.

Undirected edges are output in the *Edges section and directed edges in the *Arcs section.

Input/Output class methods

my ($bool, $errmsg) = Graph::Dijkstra->validateGraphMLxml( $filename );

Validates contents of $filename against GraphML XML schema http://graphml.graphdrawing.org/xmlns/1.1/graphml.xsd $bool is true (1) and $errmsg is empty if file contents are validated against schema. Otherwise, $bool is false (0) and $errmsg is set.

my ($bool, $errmsg) = Graph::Dijkstra->validateGEXFxml( $filename );

Validates contents of $filename against GEXF XML schema http://www.gexf.net/1.2draft/gexf.xsd $bool is true (1) and $errmsg is empty if file contents are validated against schema. Otherwise, $bool is false (0) and $errmsg is set.

CHANGE HISTORY

Version 0.3

        o Initial development release (first release that indexed correctly on CPAN)

Version 0.4

        o Added input/output methods for Pajek (NET) format files
        o Lots of incompatible changes.
        o Changed references to edge attribute labels to consistently use: sourceID, targetID, and weight.  
        o In the farthestNode and shortestPath methods, changed origin and destination to originID and destinationID as the starting and endpoint node ID values.
        o Changed the node, edge, removeEdge, adjacent, and edgeExists methods to use hash references as parameters.  Get version of node method returns hash reference.
                > Thought is that using hash references as parameters will better support future addition of graph, node, and edge attributes.
        o Changed the farthestNode and shortestPath methods to input the node ID value(s) in the solution hash reference parameter as "originID" and "destinationID".
        o Changed the solution hash reference returned by the farthestNode, shortestPath methods to use sourceID, targetID, and weight as hash attributes replacing source, target, and cost
        o Added two graph level attributes: label and creator.  Attributes are input / output from / to files as supported by that format.
        o Updated new method to accept an optional parameter hash (reference) with graph attributes
        o Added graph method to update (set) or return (get) graph attributes.
        o In files produced by the outputGraphto[file format] methods (exlcuding CSV files), added a comment "Generated by Graph::Dijkstra on [localtime string]".  In JSON, comment is a "metadata" attribute of the Graph object.
        o Validated that JSON text created by outputGraphtoJSON conforms to the JSON Graph Specification schema. 
        o Always bug fixes.
        o Updated test scripts.
        

Version 0.41

        o Updated edge (GET) method to return a hash reference with three attributes: sourceID, targetID, and weight.

Version 0.50

        o Added initial support for directed edges throughout (in addition to undirected). Not extensively tested.
        o Added three attributes to edge: 'id', 'label', and 'directed' (directed/undirected).
        o Added graph attribute 'edgedefault' (directed/undirected).
        o Added three readonly attribute hashes with default values for graph, node, and edge elements.  Expectation is that new attributes can be supported by updating the default hashes.
        o Updated format for CSV files (incompatible change) and rewrote associated input/output methods.
        o Added two "helper" class methods: stringifyAttribs and hashifyAttribs.
        o Added utf8::upgrade calls for all graph, node, and edge attribute values.
        o Added tests for directed and undirected edges.
        

Version 0.55

        o Modified inputGraphfrom[format] methods to also work as class methods that create and populate an internal graph instance.
        o Updated (bug fix) outputGraphtoGEXF to include version attribute in gexf element as required by schema.
        o Updated (bug fix) outputGraphtoGraphML and inputGraphfromGraphML to output and input graph element label values using the key element id / data key value "graphlabel" to avoid conflict with the edge label id value ("label")
        o Added two class methods, validateGraphMLxml and validateGEXFxml, that validate the contents of XML files against the GraphML and GEXF schemas.
        o Used the two new class validate[format]xml methods to validate the contents of xml files produced by outputGraphtoGraphML and outputGraphtoGEXF
        

Version 0.56

        o Modified VERBOSE method to accept an optional filehandle to redirect informational output.
        o Modified inputGraphfromGEXF method to input weight from either edge attribute weight (weight="9.9") or edge attvalues/attvalue element C<< <attvalue for="weight" value="9.9" /> >>.
        o Fixed bug in computation methods related to nodes with inbound but no outbound edges (directed))
        o Corrected / updated documentation (POD).

Version 0.60

        o Added Floyd Warshall variant of vertexCenter method and added tests.
        o Refactored (substantially rewrote) edge method and added tests.
        o Updated validateGraphMLxml and validateGEXFxml class methods to check for schema parse errors indicating that schema URL is not available; updated tests.
        o Documentation (POD) updates.
        

Version 0.70

        o Refactored internal representation of graph edges and nodes: separate hash structures within the graph object
        o Fixed subtle bug in removeNode method
        o Added nodeIDsList method and integrated into outputGraphto* methods replacing direct access to internal graph object data
        o Added two private methods, _getEdgeAttrib and _edgeExists, and integrated into computation and outputGraphto* methods replacing direct access to internal graph object data
        

PLATFORM DEPENDENCIES

Critical module depencies are XML::LibXML and Array::Heap::ModifiablePriorityQueue which itself uses Array::Heap. XML::LibXML and Array::Head are XS modules. XML::LibXML requires the c library libxml2.

For use with very large XML based graph files (GEXF or GraphML), recommend 64bit versions of Perl and the libxml2 (c) library. See the "Performance" section.

PERFORMANCE

Performance measurements were recorded on an unremarkable laptop (Intel Core i5) running Microsoft Windows 10 (64bit) and ActiveState Perl 5.20.2 (x86). Reported times are Perl "use benchmark" measurements of the runtime of the core algorithm within the referenced methods. Timings exclude data loading. Measurements are indicative at best.

With a test graph of 16+K nodes and 121+K edges, both farthest and shortestPath completed in under 1 second (~0.45seconds).

The vertexCenter method is implemented using Dijkstra's shortest path algorithm which provides reasonable performance on connected graphs (where every node is reachable). On sparsely (weakly) connected graphs (not every node pair is connected), the runtime substantially increases as the algorithm repeatedly attempts to compute shortest paths between nodes not connected by edges. The Floyd Warshall implementation (vertexCenterFloydWarshall) includes a tweak that improves performance (reduces runtime) on sparsely connected graphs for nodes with no outbound edges (outdegree = 0). On connected graphs, Dijkstra should run faster than Floyd Warshall. In all other cases, Floyd Warshall should run faster.

For both implementations of vertexCenter, runtime is exponential (O3) based on the number of nodes. Comptemplating adding a limit on the number of nodes, probably 1,000.

With a GraphML (XML) dataset containing over 700K nodes and 1M edges (>6M lines of text, ~175MB file size), the perl process ran out of memory (exceeded the 2GB per process limit for 32bit applications under 64bit MS Windows). The memory allocation limit was reached in the libxml2 (c) library before control was returned to Perl. Using the 64bit version of Perl should eliminate this problem. The GraphML file is available at http://sonetlab.fbk.eu/data/social_networks_of_wikipedia/ under the heading "Large Wikipedias (2 networks extracted automatically with the 2 algorithms)". Filename is eswiki-20110203-pages-meta-current.graphml.

LIMITATIONS / KNOWN ISSUES

Node ID values must be simple scalars.

Some inputGraphfrom[format] methods default edge weights to 1.

For speed and simplicity, input of GML format files (method inputGraphfromGML) implemented using pattern matching (regexp's). An unmatched closing bracket (']') inside a quoted string (value) will break it. For testing, implemented an alternative using Parse::Recdescent (recursive descent) that eliminated the "unmatched closing bracket insided a quoted string" problem. Unfortunately, performance was unacceptably slow (very bad) on large graph files (10K+ nodes, 100K+ thousand edges). Will continue to evaluate alternatives.

At the time v0.60 was released to CPAN, the GraphML site http://graphml.graphdrawing.org/ had been down for 12+ hours. This prevents the validateGraphMLxml from functioning (as the schema URL is not available).

TODOs

Add data attributes including:

        node graph coordinates (xy coordinates and/or lat/long),
        node and edge style (eg., line style, color)

Review and update inputGraphfrom[format] methods to consistently set default edge weight to 1 or allow caller to provide default edge weight.

Test very large graph datasets using a 64bit version of perl (without the 2GB process limit).

Add validateJSONgraph class method that validates contents of JSON file against JSON graph specification (dependent upon JSV::Validator package installing correctly)

Evaluate and refactor code to move the input/output graph functions to one or more separate packages to reduce the code size of the Graph::Dijkstra package. For example, put the input graph from file methods in a separate package and refactor to return a graph object. For example:

        use Graph::Dijkstra;
        use Graph::Dijkstra::Input;
        
        my $graph = Graph::Dijkstra::Input::inputGraphfromGraphML($filename);
        
        #or maybe as functional call
        
        my $graph = inputGraphfromGraphML($filename);

Evaluate support for user defined graph, node, and edge attributes (metadata).

Continue to evaluate performance of vertexCenter methods. Add a progress indicator triggered by the verbose setting.

Input welcome. Please email author with suggestions. Graph data sets for testing (purged of sensitive/confidential data) are welcome and appreciated.

SEE ALSO

Array::Heap::ModifiablePriorityQueue

Graph Modelling Language https://en.wikipedia.org/wiki/Graph_Modelling_Language

JSON Graph Specification https://github.com/jsongraph/json-graph-specification

GraphML Primer http://graphml.graphdrawing.org/primer/graphml-primer.html

GraphML example http://gephi.org/users/supported-graph-formats/graphml-format/

GEXF file format http://www.gexf.net/format/index.html

NET (Pajek) File format https://gephi.org/users/supported-graph-formats/pajek-net-format/

AUTHOR

D. Dewey Allen <ddallen16@gmail.com>

COPYRIGHT/LICENSE

Copyright (C) 2015, D. Dewey Allen

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See perlartistic.

DISCLAIMER OF WARRANTY

BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR, OR CORRECTION.

IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.