Business scenario
Spark reads files through an InputFormat, and by default it reads one record per line. In some specific cases this default InputFormat does not work well. One requirement at our company ran into the following problem: most of our ETL jobs clean data every day, and the file format is one JSON string per line written to an HDFS file, for example:
{"queryType":"BASIC","searchParameters":{"typeName": "hive_column","excludeDeletedEntities": false,"includeClassificationAttributes": false,"includeSubTypes": true,"limit": 100,"offset": 0},"entities":{"0":[{"typeName": "hive_column","attributes":[{"owner":"bigdata","createTime":"2019-08-20","qualifiedName":"default.em_join.name@primary","name":"em1"}],"guid":"9bf716ce-ba58-4d97-b5bd-d4cef06207b9", "status":"ACTIVE","displayText":"name","classificationNames":[]}]}} }
But a JSON string can be too long to fit on a single line, so the producer breaks such a record across several physical lines and escapes the inserted newline with a trailing backslash (`\`). The data above then becomes the following:
{"queryType":"BASIC","searchParameters":{"typeName": "hive_column","excludeDeletedEntities": false,"includeClassificationAttributes": false,"includeSubTypes": true,"limit": 100,"offset": 0},"entities":{"0":[{"typeName": "hive_column","attributes":[{"owner":"bigdata","createTime":"2019-08-20","qualifiedName":"default.em_join.name@primary","name":"em1"}],"guid":"9bf716ce-ba58-4d97-b5bd-d4cef06207b9", "status":"ACTIVE","displayText":"name","classificationNames":[]}]}} {"queryType":"BASIC","searchParameters":{"typeName": "hive_column","excludeDeletedEntities": false,"includeClassificationAttributes": false,"includeSubTypes": true,"limit": 100,"offset": 0},"entities":{"0":[{"typeName": "hive_column","attributes":[{"owner":"bigdata","createTime":"2019-08-20","qualifiedName":"default.em_join.name@primary","name":"em1"}],"guid":"9bf716ce-ba58-4d97-b5bd-d4cef06207b9\ ", "status":"ACTIVE","displayText":"name","classificationNames":[]}]}}
In this case, Spark's default reading mode produces three records, but the example actually contains only two logical records.
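To see the problem concretely, here is a minimal sketch (not part of the original program) that reads the sample file with Spark's default line-based InputFormat; the path is the same one used later in this post.

import org.apache.spark.{SparkConf, SparkContext}

// A sketch only: sc.textFile uses the default line-based TextInputFormat,
// so the escaped newline still splits the second record in two.
object DefaultReadCheck {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[*]").setAppName("DefaultReadCheck"))
    val lines = sc.textFile("hdfs://hadoop01:9000/test/text.json")
    println(lines.count()) // prints 3 for the sample file, although it holds only 2 logical records
    sc.stop()
  }
}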
To handle this format correctly, we need to customize the InputFormat.
Implementation ideas
Spark provides APIs for reading HDFS files with a Hadoop InputFormat, which we can use directly. This is more convenient than writing a Hadoop MapReduce program.
Define a custom InputFormat that extends FileInputFormat and supplies its own RecordReader.
Here is the code.
Main method
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.util.parsing.json.JSON

object Spark_Hadoop_InputFormat {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("Spark_Hadoop_InputFormat")
    // Writables are not Java-serializable, so register them with Kryo
    conf.registerKryoClasses(Array[Class[_]](
      Class.forName("org.apache.hadoop.io.LongWritable"),
      Class.forName("org.apache.hadoop.io.Text")))
    val sc: SparkContext = new SparkContext(conf)

    val configuration: Configuration = new Configuration(sc.hadoopConfiguration)
    val path = "hdfs://hadoop01:9000/test/text.json"

    // read the file with the custom InputFormat
    val rdd: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile[LongWritable, Text, MyFileInputFormat](
      path, classOf[MyFileInputFormat], classOf[LongWritable], classOf[Text], configuration)

    // println(rdd.mapValues(t => t.toString).collect.mkString(","))

    // parse each reassembled record as JSON
    val result: RDD[Option[Any]] = rdd.mapValues(t => t.toString).map {
      case (k, v) => JSON.parseFull(v)
    }
    result.foreach(_ match {
      case Some(map: Map[String, Any]) => println(map)
      case None => println("Conversion error")
      case _ => println("System exception")
    })

    sc.stop()
  }
}
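As a quick sanity check (a sketch, not from the original program), the count of the resulting RDD should equal the number of logical records. These lines could be added inside main after building rdd:

// assumes the sample file shown earlier
val logicalRecords = rdd.count()
println(s"logical records: $logicalRecords") // expected: 2, even though the file has 3 physical lines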
Customize InputFormat
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

/**
 * Custom InputFormat: extend FileInputFormat and
 * override createRecordReader to return our own RecordReader.
 */
class MyFileInputFormat extends FileInputFormat[LongWritable, Text] {
  override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[LongWritable, Text] =
    new MyRecordReader()
}
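A side note, not part of the original solution: if you would rather not deal with records that straddle split boundaries at all, FileInputFormat also lets you disable splitting per file by overriding isSplitable. The trade-off is losing parallelism within a single file. A hedged sketch of that alternative (the class name is made up here):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

// Hypothetical alternative: one split per file, so a record can never be cut in half
class MyWholeFileInputFormat extends FileInputFormat[LongWritable, Text] {
  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[LongWritable, Text] =
    new MyRecordReader()
}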
Rewrite RecordReader
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}
import org.apache.hadoop.util.LineReader

class MyRecordReader() extends RecordReader[LongWritable, Text] {
  var start, end, pos = 0L
  var reader: LineReader = null
  var key = new LongWritable
  var value = new Text

  override def initialize(inputSplit: InputSplit, context: TaskAttemptContext): Unit = {
    // the split to read; start one byte earlier so we can tell whether the
    // split begins in the middle of a record from the previous split
    val split = inputSplit.asInstanceOf[FileSplit]
    start = 0L.max(split.getStart - 1)
    end = start + split.getLength

    // open the file and seek to the start of the split
    val stream = split.getPath.getFileSystem(context.getConfiguration)
      .open(split.getPath)
    stream.seek(start)

    // create the LineReader used to read line by line
    reader = new LineReader(stream, context.getConfiguration)

    // if the split starts at a newline, back up one more byte
    // to check whether that newline was escaped or not
    val firstByte = stream.readByte().toInt
    if (firstByte == '\n')
      start = 0L.max(start - 1)
    stream.seek(start)

    if (start != 0)
      skipRemainderFromPreviousSplit(reader)
  }

  def skipRemainderFromPreviousSplit(reader: LineReader): Unit = {
    var readAnotherLine = true
    while (readAnotherLine) {
      // read next line
      val buffer = new Text()
      start += reader.readLine(buffer, Integer.MAX_VALUE, Integer.MAX_VALUE)
      pos = start

      // detect if delimiter was escaped
      readAnotherLine = buffer.getLength >= 1 && // something was read
        buffer.charAt(buffer.getLength - 1) == '\\' && // newline was escaped
        pos <= end // seek head hasn't passed the split
    }
  }

  override def nextKeyValue(): Boolean = {
    key.set(pos)

    // read newlines until an unescaped newline is read
    var lastNewlineWasEscaped = false
    while (pos < end || lastNewlineWasEscaped) {
      // read next line
      val buffer = new Text
      pos += reader.readLine(buffer, Integer.MAX_VALUE, Integer.MAX_VALUE)

      // append newly read data to previous data if necessary
      value = if (lastNewlineWasEscaped) new Text(value + "\n" + buffer) else buffer

      // detect if delimiter was escaped (guard against an empty buffer at end of stream)
      lastNewlineWasEscaped = buffer.getLength >= 1 &&
        buffer.charAt(buffer.getLength - 1) == '\\'

      // let Spark know that a key-value pair is ready!
      if (!lastNewlineWasEscaped)
        return true
    }

    // end of split reached
    false
  }

  override def getCurrentKey: LongWritable = key

  override def getCurrentValue: Text = value

  // fraction of the split that has been consumed so far
  override def getProgress: Float =
    if (end == start) 0.0f else ((pos - start).toFloat / (end - start)).min(1.0f)

  override def close(): Unit = {
    if (reader != null) reader.close()
  }
}
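To convince yourself the reader behaves as intended, here is a minimal local check (a sketch; the file contents, object name, and temp-file path are made up for illustration, not from the original post). It writes two logical records, the second broken across two physical lines with a trailing backslash, reads them back with MyFileInputFormat, and counts the logical records:

import java.nio.charset.StandardCharsets
import java.nio.file.Files

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}

object MyRecordReaderCheck {
  def main(args: Array[String]): Unit = {
    // two logical records on three physical lines;
    // the newline inside the second record is escaped with a trailing backslash
    val sample =
      "{\"id\":1,\"name\":\"first record\"}\n" +
      "{\"id\":2,\"name\":\"second \\\n" +
      "record\"}\n"
    val file = Files.createTempFile("escaped-newlines", ".json")
    Files.write(file, sample.getBytes(StandardCharsets.UTF_8))

    val sc = new SparkContext(
      new SparkConf().setMaster("local[*]").setAppName("MyRecordReaderCheck"))
    val rdd = sc.newAPIHadoopFile[LongWritable, Text, MyFileInputFormat](
      file.toUri.toString,
      classOf[MyFileInputFormat], classOf[LongWritable], classOf[Text],
      new Configuration(sc.hadoopConfiguration))

    println(rdd.count()) // expected: 2 logical records, not 3 lines
    sc.stop()
  }
}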