GraphX processing of JanusGraph data: an implementation

Keywords: Programming Java Spark Apache JSON

Declarations: This scheme is an alternative in case the GremlinSQL scheme of direct execution on Spark is blocked. It does not involve any work secrets, so there is no possibility of leaking confidential information; it is purely personal reflection, and I hope it makes a contribution.

Scheme: Convert the result of a GremlinSQL query into star-graph JSON and write it to HDFS; Spark then reads the star-graph JSON from HDFS to build a graph that GraphX can use, after which GraphX's rich set of graph algorithms can be called. In this way GraphX operates on JanusGraph data.

1. Convert the GremlinSQL query results to star-graph JSON

Since the GraphSON written by org.apache.tinkerpop.gremlin.structure.io.graphson.GraphSONWriter does not meet the requirements, the queried vertex-and-edge data (with paths) is converted by hand into a single-vertex star-graph JSON structure. The conversion method is shown below; the result is then saved to HDFS. The storage step is not covered in detail here (a rough sketch follows the method).

public StringBuilder generatorStarGraphJson(Vertex vertex, Edge edge, StringBuilder starGraphJson) throws Exception {
	String inVId = edge.inVertex().id().toString();
	String outVId = edge.outVertex().id().toString();
	String vId = vertex.id().toString();
	starGraphJson.append("{").append("\"id\":" + vertex.id() + "," + "\"label\":\"" + vertex.label() + "\",");
	// A star vertex can carry both outE and inE sections: an outE entry records the inV id, an inE entry records the outV id
	if (inVId.equalsIgnoreCase(vId)) {
		// The vertex is the head of the edge, so the edge is incoming: record it under inE with the outV id
		starGraphJson.append("\"inE\":{").append("\"" + edge.label() + "\":[{").append("\"id\":\"" + edge.id() + "\",")
				.append("\"outV\":" + edge.outVertex().id() + ",").append("\"properties\":{" + concatEdgeProperties(edge) + "}}]},");
	} else if (outVId.equalsIgnoreCase(vId)) {
		// The vertex is the tail of the edge, so the edge is outgoing: record it under outE with the inV id
		starGraphJson.append("\"outE\":{").append("\"" + edge.label() + "\":[{").append("\"id\":\"" + edge.id() + "\",")
				.append("\"inV\":" + edge.inVertex().id() + ",").append("\"properties\":{" + concatEdgeProperties(edge) + "}}]},");
	} else {
		throw new Exception("Vertex/edge mismatch: the edge is not incident to this vertex!");
	}
	// Append the vertex properties
	starGraphJson.append("\"properties\":{").append(concatVertexProperties(vertex)).append("}}");
	return starGraphJson;
}
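
The original post leaves the HDFS write out. Purely as an illustration, and assuming the simplest layout of one star-graph JSON document per line (which is what the GraphSONInputFormat used in step 2 reads), the file could be produced with the Hadoop FileSystem API roughly as in the Scala sketch below; the object name, URI and path are placeholders of my own, not from the original.

import java.io.PrintWriter

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object StarGraphJsonWriter {

  // Writes one star-graph JSON document per line to the given HDFS location.
  // hdfsUri and outputPath are placeholders, e.g. "hdfs://namenode:8020" and "/tmp/starGraph.json".
  def writeToHdfs(jsonDocuments: Seq[String], hdfsUri: String, outputPath: String): Unit = {
    val conf = new Configuration()
    conf.set("fs.defaultFS", hdfsUri)
    val fs = FileSystem.get(conf)
    // Overwrite an existing file so the conversion can be re-run
    val out = fs.create(new Path(outputPath), true)
    val writer = new PrintWriter(out)
    try {
      jsonDocuments.foreach(writer.println)
    } finally {
      writer.close()
    }
  }
}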

2. Spark reads the star graph from the specified path and builds the GraphX graph

import java.util

import org.apache.commons.configuration.BaseConfiguration
import org.apache.spark.api.java.{JavaPairRDD, JavaSparkContext}
import org.apache.spark.graphx
import org.apache.spark.graphx.{VertexId, lib}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.tinkerpop.gremlin.hadoop.Constants
import org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
import org.apache.tinkerpop.gremlin.hadoop.structure.io.VertexWritable
import org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONInputFormat
import org.apache.tinkerpop.gremlin.spark.structure.io.InputFormatRDD
import org.apache.tinkerpop.gremlin.structure.util.star.StarGraph
import org.apache.tinkerpop.gremlin.structure.{Edge, Property}

import scala.collection.mutable.ArrayBuffer

class GraphSon2GraphXRDD() extends Serializable {

def getGraphConf(HDFSFilePath: String): BaseConfiguration = {
  val inputGraphConf = new BaseConfiguration
  inputGraphConf.setProperty("gremlin.graph", classOf[HadoopGraph].getName)
  inputGraphConf.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, classOf[GraphSONInputFormat].getName)
  inputGraphConf.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, HDFSFilePath)
  inputGraphConf.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, HDFSFilePath)
  inputGraphConf
}

def getSc(sparkHost: String, isRemote: Boolean): SparkContext = {
  val sparkConf = new SparkConf()
  if (isRemote) {
    // To Improve
  } else {
    sparkConf.setMaster("local[*]").setAppName("GraphSon2GraphX")
  }
  val sc = new SparkContext(sparkConf)
  sc
}

def getJavaRDD(conf: BaseConfiguration, sc: SparkContext): JavaPairRDD[AnyRef, VertexWritable] = {
  val jsc = JavaSparkContext.fromSparkContext(sc)
  val graphRDDInput = new InputFormatRDD
  val vertexWritableJavaPairRDD = graphRDDInput.readGraphRDD(conf, jsc)
  vertexWritableJavaPairRDD
}

def getVertexRDD(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable]): RDD[(Long, util.HashMap[String, java.io.Serializable])] = {
  vertexWritableJavaPairRDD.rdd.map((tuple2: Tuple2[AnyRef, VertexWritable]) => {
    // Get the center vertex
    val v = tuple2._2.get
    val g = StarGraph.of(v)
    // In case the vertex id in TinkerGraph is not long type
    // val vid = convertStringIDToLongID(v.id().toString)
    val vid = v.id().toString.toLong
    // Pass the vertex properties to the GraphX vertex value map and keep the original vertex id
    val graphxValueMap: util.HashMap[String, java.io.Serializable] = new util.HashMap[String, java.io.Serializable]()
    graphxValueMap.put("originalID", v.id().toString)
    graphxValueMap.putAll(g.traversal.V(v.id).valueMap().next(1).get(0))
    (vid, graphxValueMap)
  })
}
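
// Hypothetical helper (not shown in the original post): the commented-out convertStringIDToLongID
// calls in getVertexRDD and getEdgeRDD assume something like this for graphs whose vertex ids are
// not already Long. Hashing the string id is only a sketch and can produce collisions.
def convertStringIDToLongID(stringId: String): Long = {
  stringId.hashCode.toLong
}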

def getEdgeRDD(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable]): RDD[graphx.Edge[util.HashMap[String, java.io.Serializable]]] = {
val edge = vertexWritableJavaPairRDD.rdd.flatMap((tuple2: Tuple2[AnyRef, VertexWritable]) => {
  val v = tuple2._2.get
  val g = StarGraph.of(v)
  val edgelist: util.List[Edge] = g.traversal.V(v.id).outE().toList

  // Put all edges of the center vertex into the list
  val list = new collection.mutable.ArrayBuffer[graphx.Edge[util.HashMap[String,java.io.Serializable]]]()
  for (x <- 0 until edgelist.size()) {
    // These are outgoing edges of the center vertex, so the center vertex (outVertex) is the
    // source and the adjacent vertex (inVertex) is the destination
    val srcId = edgelist.get(x).outVertex.id().toString
    val dstId = edgelist.get(x).inVertex.id().toString
    //        val md1 = convertStringIDToLongID(srcId)
    //        val md2 = convertStringIDToLongID(dstId)
    val md1 = srcId.toLong
    val md2 = dstId.toLong
    // Get the properties of the edge
    val edgeAttr = new util.HashMap[String, java.io.Serializable]()
    val properties: util.Iterator[Property[Nothing]] = edgelist.get(x).properties()
    while (properties.hasNext) {
      val property = properties.next()
      edgeAttr.put(property.key(), property.value().toString)
    }
    list.append(graphx.Edge(md1,md2,edgeAttr))
  }
  list
})
val edgeRDD = edge.distinct()
edgeRDD

}

def doLAP(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable], iterationNum: Int): Array[Array[String]] = {
val vertexRDD = getVertexRDD(vertexWritableJavaPairRDD)

val edgeRDD = getEdgeRDD(vertexWritableJavaPairRDD)

val graph = graphx.Graph[util.HashMap[String,java.io.Serializable],
  util.HashMap[String,java.io.Serializable]](vertexRDD,edgeRDD,new util.HashMap[String,java.io.Serializable]())

val LVMRsult = lib.LabelPropagation.run(graph , iterationNum).vertices.collect.sortWith (_._1 < _._1).map(f => {
  println(f.toString())
  f})
getFinalCommunit(LVMRsult)

}

def getFinalCommunit(LVMRsult: Array[(Long, Long)]): Array[Array[String]] = {
val result = new Array[Array[String]](LVMRsult.length)
val tmp = new ArrayBuffer[String]()
for (i <- 0 until LVMRsult.length) {
  var k = 0
  val array = new ArrayBuffer[String]()
  //Community contains multiple values
  for(j <- (i+1) until LVMRsult.length) {
    if(LVMRsult(i)._2.equals(LVMRsult(j)._2)){
      if(!tmp.contains(LVMRsult(i)._1.toString)){
        array += LVMRsult(i)._1.toString
        tmp += LVMRsult(i)._1.toString
      }
      if(!tmp.contains(LVMRsult(j)._1.toString)){
        array += LVMRsult(j)._1.toString
        tmp += LVMRsult(j)._1.toString
      }
      k = k+1
    }
  }

  //Being a Community
  if(k.equals(0)){
    if(!tmp.contains(LVMRsult(i)._1.toString)){
      array += LVMRsult(i)._1.toString
      tmp += LVMRsult(i)._1.toString
    }

  }
  if(array.length > 0){
    result.update(i,array.toArray.distinct)
  }
}
result.filter(f => {
  // Guard against slots that were never filled (null) before touching f
  if (f != null && f.length > 0) println(f.mkString("(", ",", ")"))
  f != null
})

}

def doPageRank(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable], stopThreshold: Double): Array[Array[Any]] = {
val vertexRDD: RDD[(Long, util.HashMap[String, java.io.Serializable])] = getVertexRDD(vertexWritableJavaPairRDD)


val edgeRDD = getEdgeRDD(vertexWritableJavaPairRDD)

val graph = graphx.Graph[util.HashMap[String,java.io.Serializable],
  util.HashMap[String,java.io.Serializable]](vertexRDD,edgeRDD,new util.HashMap[String,java.io.Serializable]())
val gpgraph = graph.pageRank(stopThreshold).cache()

val titleAndPrGraph = graph.outerJoinVertices(gpgraph.vertices) {
  (v, title, rank) => (rank.getOrElse(0.0), title)
}

// sortBy ascending flag: false sorts by PageRank value in descending order, true in ascending order

// titleAndPrGraph.vertices.sortBy((entry: (VertexId, (Double, Object))) => entry._2._1, false).foreach(f => println(f._1+":"+f._2._1))

val pageRank = titleAndPrGraph.vertices.sortBy((entry: (VertexId, (Double, Object))) => entry._2._1, false).map(f => {
  println(f._1+":"+f._2._1)
  Array(f._1.toString,f._2._1)
})
pageRank.collect()

}
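
// Sketch of a further algorithm call (not in the original post): once the vertex and edge RDDs are
// built the same way, any other GraphX algorithm can be plugged in, e.g. connected components.
def doConnectedComponents(vertexWritableJavaPairRDD: JavaPairRDD[AnyRef, VertexWritable]): Array[(Long, Long)] = {
  val vertexRDD = getVertexRDD(vertexWritableJavaPairRDD)
  val edgeRDD = getEdgeRDD(vertexWritableJavaPairRDD)
  val graph = graphx.Graph[util.HashMap[String, java.io.Serializable],
    util.HashMap[String, java.io.Serializable]](vertexRDD, edgeRDD, new util.HashMap[String, java.io.Serializable]())
  // Each vertex is tagged with the smallest vertex id of its connected component
  graph.connectedComponents().vertices.collect()
}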

}
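
To close the loop, here is a minimal driver sketch showing how the class above could be wired together, assuming it lives in the same package as GraphSon2GraphXRDD; the HDFS path, iteration count and PageRank tolerance are placeholder values of my own, not from the original post.

object GraphSon2GraphXExample {

  def main(args: Array[String]): Unit = {
    val converter = new GraphSon2GraphXRDD()
    // Location of the star-graph JSON written in step 1 (placeholder path)
    val hdfsFilePath = "hdfs://namenode:8020/tmp/starGraph.json"

    val conf = converter.getGraphConf(hdfsFilePath)
    val sc = converter.getSc("local", isRemote = false)
    val vertexWritableJavaPairRDD = converter.getJavaRDD(conf, sc)

    // Label propagation communities after 10 iterations
    val communities = converter.doLAP(vertexWritableJavaPairRDD, 10)
    communities.foreach(c => println(c.mkString("(", ",", ")")))

    // PageRank until the per-vertex change drops below 0.001
    val ranks = converter.doPageRank(vertexWritableJavaPairRDD, 0.001)
    ranks.foreach(r => println(r.mkString(":")))

    sc.stop()
  }
}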

With this, JanusGraph and GraphX are connected end to end, and GraphX's rich set of graph algorithms can be called without obstruction. The implementation is admittedly a little rough; I offer it here in the hope that it inspires better solutions.

Posted by lovelys on Thu, 07 Nov 2019 09:49:15 -0800