Disclaimer: this scheme is an alternative for the case where direct Spark (OLAP) execution of Gremlin traversals is blocked. It involves no work-related confidential code and there is no possibility of leaking secrets; it is purely a personal reflection that I hope can be a contribution.
Scheme: convert the results of a Gremlin query into StarGraph JSON and write it to HDFS; Spark then reads the StarGraph JSON from HDFS, builds a graph that GraphX can use, and calls GraphX's rich set of graph algorithms. This achieves the goal of running GraphX over JanusGraph.
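Before step 1, the vertex-edge pairs must first come out of JanusGraph. The original write-up does not show this query, so the sketch below is only an assumption of what it might look like: a Gremlin traversal returning (vertex, edge) paths, with a hypothetical config file path and traversal shape.

```scala
import org.janusgraph.core.JanusGraphFactory
import org.apache.tinkerpop.gremlin.structure.{Edge, Vertex}
import scala.collection.JavaConverters._

object ExtractStarGraphData {
  def main(args: Array[String]): Unit = {
    // Assumed config path; substitute your own JanusGraph properties file
    val graph = JanusGraphFactory.open("conf/janusgraph-hbase.properties")
    val g = graph.traversal()
    // Each path is a (vertex, edge) pair; these feed generatorStarGraphJson in step 1
    val paths = g.V().bothE().path().toList.asScala
    paths.foreach { p =>
      val v = p.get[Vertex](0)
      val e = p.get[Edge](1)
      // ... convert (v, e) to star-graph JSON and write the line to HDFS (step 1 below)
    }
    graph.close()
  }
}
```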
1. Convert the Gremlin query results to StarGraph JSON
Since the GraphSON format produced by org.apache.tinkerpop.gremlin.structure.io.graphson.GraphSONWriter does not meet the requirements here, the vertex-edge data returned by the query (with path information) is converted by hand into a star-graph JSON structure. The conversion method is shown below; the result is then saved to HDFS (the write itself is not described in detail).
```java
// Requires org.apache.tinkerpop.gremlin.structure.Vertex and org.apache.tinkerpop.gremlin.structure.Edge;
// concatEdgeProperties/concatVertexProperties serialize the property key-value pairs (not shown).
public StringBuilder generatorStarGraphJson(Vertex vertex, Edge edge, StringBuilder starGraphJson) {
    String inVId = edge.inVertex().id().toString();
    String outVId = edge.outVertex().id().toString();
    String vId = vertex.id().toString();

    starGraphJson.append("{")
            .append("\"id\":" + vertex.id() + ",")
            .append("\"label\":\"" + vertex.label() + "\",");

    // A star vertex may carry both outE and inE: an outE entry records the inV id,
    // an inE entry records the outV id.
    if (outVId.equalsIgnoreCase(vId)) {
        // The center vertex is the out-vertex, so this is an outgoing edge.
        starGraphJson.append("\"outE\":{")
                .append("\"" + edge.label() + "\":[{")
                .append("\"id\":\"" + edge.id() + "\",")
                .append("\"inV\":" + edge.inVertex().id() + ",")
                .append("\"properties\":{" + concatEdgeProperties(edge) + "}}]},");
    } else if (inVId.equalsIgnoreCase(vId)) {
        // The center vertex is the in-vertex, so this is an incoming edge.
        starGraphJson.append("\"inE\":{")
                .append("\"" + edge.label() + "\":[{")
                .append("\"id\":\"" + edge.id() + "\",")
                .append("\"outV\":" + edge.outVertex().id() + ",")
                .append("\"properties\":{" + concatEdgeProperties(edge) + "}}]},");
    } else {
        throw new RuntimeException("Vertex/edge mismatch: the edge is not incident to this vertex!");
    }

    // Append the properties of the center vertex
    starGraphJson.append("\"properties\":{").append(concatVertexProperties(vertex)).append("}}");
    return starGraphJson;
}
```
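For reference, a single record emitted by this method has roughly the following shape (the ids, labels, and values here are made up, and the exact contents of the two "properties" objects depend on concatEdgeProperties and concatVertexProperties, which are not shown). For GraphSONInputFormat to parse the file in step 2, each vertex record must sit on its own line:

```json
{"id":1,"label":"person","outE":{"knows":[{"id":"7","inV":2,"properties":{"weight":"0.5"}}]},"properties":{"name":"marko"}}
```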
2. Spark reads the StarGraph JSON at the specified path and builds the graph
```scala
import java.util

import org.apache.commons.configuration.BaseConfiguration
import org.apache.spark.api.java.{JavaPairRDD, JavaSparkContext}
import org.apache.spark.graphx.{VertexId, lib}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext, graphx}
import org.apache.tinkerpop.gremlin.hadoop.Constants
import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
import org.apache.tinkerpop.gremlin.hadoop.structure.io.VertexWritable
import org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONInputFormat
import org.apache.tinkerpop.gremlin.spark.structure.io.InputFormatRDD
import org.apache.tinkerpop.gremlin.structure.util.star.StarGraph
import org.apache.tinkerpop.gremlin.structure.{Edge, Property}

import scala.collection.mutable.ArrayBuffer

class GraphSon2GraphXRDD() extends Serializable {

  // Build the Hadoop-Gremlin configuration that tells TinkerPop where and how to read the graph
  def getGraphConf(HDFSFilePath: String): BaseConfiguration = {
    val inputGraphConf = new BaseConfiguration
    inputGraphConf.setProperty("gremlin.graph", classOf[HadoopGraph].getName)
    inputGraphConf.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, classOf[GraphSONInputFormat].getName)
    inputGraphConf.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, HDFSFilePath)
    inputGraphConf.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, HDFSFilePath)
    inputGraphConf
  }

  def getSc(sparkHost: String, isRemote: Boolean): SparkContext = {
    val sparkConf = new SparkConf()
    if (isRemote) {
      // To improve: configure the master URL for a remote cluster
    } else {
      sparkConf.setMaster("local[*]").setAppName("GraphSon2GraphX")
    }
    new SparkContext(sparkConf)
  }

  def getJavaRDD(conf: BaseConfiguration, sc: SparkContext): JavaPairRDD[AnyRef, VertexWritable] = {
    val jsc = JavaSparkContext.fromSparkContext(sc)
    val graphRDDInput = new InputFormatRDD
    graphRDDInput.readGraphRDD(conf, jsc)
  }

  def getVertexRDD(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable]): RDD[(Long, util.HashMap[String, java.io.Serializable])] = {
    vertexWritableJavaPairRDD.rdd.map((tuple2: Tuple2[AnyRef, VertexWritable]) => {
      // Get the center vertex of the star graph
      val v = tuple2._2.get
      val g = StarGraph.of(v)
      // In case the vertex id in TinkerGraph is not a long type:
      // val vid = convertStringIDToLongID(v.id().toString)
      val vid = v.id().toString.toLong
      // Pass the vertex properties to the GraphX vertex value map and keep the original vertex id
      val graphxValueMap = new util.HashMap[String, java.io.Serializable]()
      graphxValueMap.put("originalID", v.id().toString)
      graphxValueMap.putAll(g.traversal.V(v.id).valueMap().next(1).get(0)
        .asInstanceOf[util.Map[String, java.io.Serializable]])
      (vid, graphxValueMap)
    })
  }

  def getEdgeRDD(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable]): RDD[graphx.Edge[util.HashMap[String, java.io.Serializable]]] = {
    val edge = vertexWritableJavaPairRDD.rdd.flatMap((tuple2: Tuple2[AnyRef, VertexWritable]) => {
      val v = tuple2._2.get
      val g = StarGraph.of(v)
      // Put all outgoing edges of the center vertex into the list
      val edgelist: util.List[Edge] = g.traversal.V(v.id).outE().toList
      val list = new ArrayBuffer[graphx.Edge[util.HashMap[String, java.io.Serializable]]]()
      for (x <- 0 until edgelist.size()) {
        // A GraphX edge runs from the out-vertex (source) to the in-vertex (destination)
        val srcId = edgelist.get(x).outVertex.id().toString
        val dstId = edgelist.get(x).inVertex.id().toString
        // val md1 = convertStringIDToLongID(srcId)
        // val md2 = convertStringIDToLongID(dstId)
        val md1 = srcId.toLong
        val md2 = dstId.toLong
        // Get the properties of the edge
        val edgeAttr = new util.HashMap[String, java.io.Serializable]()
        val properties: util.Iterator[Property[Nothing]] = edgelist.get(x).properties()
        while (properties.hasNext) {
          val property = properties.next()
          edgeAttr.put(property.key(), property.value().toString)
        }
        list.append(graphx.Edge(md1, md2, edgeAttr))
      }
      list
    })
    edge.distinct()
  }

  def doLAP(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable], iterationNum: Int): Array[Array[String]] = {
    val vertexRDD = getVertexRDD(vertexWritableJavaPairRDD)
    val edgeRDD = getEdgeRDD(vertexWritableJavaPairRDD)
    val graph = graphx.Graph[util.HashMap[String, java.io.Serializable], util.HashMap[String, java.io.Serializable]](
      vertexRDD, edgeRDD, new util.HashMap[String, java.io.Serializable]())
    val lpaResult = lib.LabelPropagation.run(graph, iterationNum).vertices.collect.sortWith(_._1 < _._1).map(f => {
      println(f.toString())
      f
    })
    getFinalCommunity(lpaResult)
  }

  def getFinalCommunity(lpaResult: Array[(Long, Long)]): Array[Array[String]] = {
    val result = new Array[Array[String]](lpaResult.length)
    val seen = new ArrayBuffer[String]()
    for (i <- 0 until lpaResult.length) {
      var k = 0
      val array = new ArrayBuffer[String]()
      // A community that contains multiple vertices
      for (j <- (i + 1) until lpaResult.length) {
        if (lpaResult(i)._2.equals(lpaResult(j)._2)) {
          if (!seen.contains(lpaResult(i)._1.toString)) {
            array += lpaResult(i)._1.toString
            seen += lpaResult(i)._1.toString
          }
          if (!seen.contains(lpaResult(j)._1.toString)) {
            array += lpaResult(j)._1.toString
            seen += lpaResult(j)._1.toString
          }
          k = k + 1
        }
      }
      // A vertex that forms a community on its own
      if (k == 0 && !seen.contains(lpaResult(i)._1.toString)) {
        array += lpaResult(i)._1.toString
        seen += lpaResult(i)._1.toString
      }
      if (array.length > 0) {
        result.update(i, array.toArray.distinct)
      }
    }
    result.filter(f => {
      if (f != null) println(f.mkString("(", ",", ")"))
      f != null
    })
  }

  def doPageRank(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable], stopThreshold: Double): Array[Array[Any]] = {
    val vertexRDD: RDD[(Long, util.HashMap[String, java.io.Serializable])] = getVertexRDD(vertexWritableJavaPairRDD)
    val edgeRDD = getEdgeRDD(vertexWritableJavaPairRDD)
    val graph = graphx.Graph[util.HashMap[String, java.io.Serializable], util.HashMap[String, java.io.Serializable]](
      vertexRDD, edgeRDD, new util.HashMap[String, java.io.Serializable]())
    val prGraph = graph.pageRank(stopThreshold).cache()
    // Join the PageRank score back onto the original vertex attributes
    val titleAndPrGraph = graph.outerJoinVertices(prGraph.vertices) {
      (v, title, rank) => (rank.getOrElse(0.0), title)
    }
    // ascending = false, so the highest-ranked vertices come first
    val pageRank = titleAndPrGraph.vertices.sortBy((entry: (VertexId, (Double, Object))) => entry._2._1, false).map(f => {
      println(f._1 + ":" + f._2._1)
      Array(f._1.toString, f._2._1)
    })
    pageRank.collect()
  }
}
```
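To make the flow concrete, a minimal driver sketch might wire the pieces together as follows. The HDFS path, iteration count, and convergence threshold are placeholders, and since the remote branch of getSc is still unimplemented, this assumes a local run:

```scala
object GraphSon2GraphXDemo {
  def main(args: Array[String]): Unit = {
    val reader = new GraphSon2GraphXRDD()
    // Placeholder HDFS path; point it at the star-graph JSON written in step 1
    val conf = reader.getGraphConf("hdfs://namenode:8020/tmp/starGraph")
    val sc = reader.getSc("local", isRemote = false)
    val vertexWritableRDD = reader.getJavaRDD(conf, sc)

    // Label propagation with an assumed 5 iterations
    val communities = reader.doLAP(vertexWritableRDD, 5)
    communities.foreach(c => println(c.mkString("(", ",", ")")))

    // PageRank with an assumed convergence threshold of 0.001
    val ranks = reader.doPageRank(vertexWritableRDD, 0.001)
    ranks.take(10).foreach(r => println(r.mkString(":")))

    sc.stop()
  }
}
```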
With this, JanusGraph and GraphX are connected end to end, and GraphX's rich set of graph algorithms can be called without obstruction. The implementation is admittedly a bit rough; consider it a brick thrown out in the hope of attracting jade.